/**
 * This file is part of ORB-SLAM.
 * It is based on the file orb.cpp from the OpenCV library (see BSD license below).
 *
 * Copyright (C) 2014 Raúl Mur-Artal <raulmur at unizar dot es> (University of Zaragoza)
 * For more information see <http://webdiis.unizar.es/~raulmur/orbslam/>
 *
 * ORB-SLAM is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * ORB-SLAM is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with ORB-SLAM. If not, see <http://www.gnu.org/licenses/>.
 */

/*********************************************************************
 * Software License Agreement (BSD License)
 *
 *  Copyright (c) 2009, Willow Garage, Inc.
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above
 *     copyright notice, this list of conditions and the following
 *     disclaimer in the documentation and/or other materials provided
 *     with the distribution.
 *   * Neither the name of the Willow Garage nor the names of its
 *     contributors may be used to endorse or promote products derived
 *     from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *  FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *  COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *  POSSIBILITY OF SUCH DAMAGE.
 *
 *********************************************************************/
#include "ORBextractor.h"
#include <stdlib.h>    // NOLINT
#include <glog/logging.h>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/features2d.hpp>
#include <vector>

#include <iostream>
#ifdef __ARM_NEON__
#include <arm_neon.h>
#define NEON
#endif


using cv::Mat;
using cv::Point;
using cv::KeyPoint;
using cv::AutoBuffer;
using cv::Point2f;
using cv::Rect;
using cv::FastFeatureDetector;
using cv::KeyPointsFilter;
using cv::InputArray;
using cv::OutputArray;
using cv::ORB;
using cv::Size;
using cv::BORDER_REFLECT_101;
using cv::INTER_NEAREST;
using cv::BORDER_ISOLATED;
using cv::INTER_LINEAR;
using std::vector;


static const ORBextractor::SinCosAngleVal sin_cos_0_360_deg_look_up[361] = {
    {0.0000000000,  1.0000000000},   // 0  deg
    {0.0174524064,  0.9998476952},   // 1  deg
    {0.0348994967,  0.9993908270},   // 2  deg
    {0.0523359562,  0.9986295348},   // 3  deg
    {0.0697564737,  0.9975640503},   // 4  deg
    {0.0871557427,  0.9961946981},   // 5  deg
    {0.1045284633,  0.9945218954},   // 6  deg
    {0.1218693434,  0.9925461516},   // 7  deg
    {0.1391731010,  0.9902680687},   // 8  deg
    {0.1564344650,  0.9876883406},   // 9  deg
    {0.1736481777,  0.9848077530},   // 10 deg
    {0.1908089954,  0.9816271834},   // 11 deg
    {0.2079116908,  0.9781476007},   // 12 deg
    {0.2249510543,  0.9743700648},   // 13 deg
    {0.2419218956,  0.9702957263},   // 14 deg
    {0.2588190451,  0.9659258263},   // 15 deg
    {0.2756373558,  0.9612616959},   // 16 deg
    {0.2923717047,  0.9563047560},   // 17 deg
    {0.3090169944,  0.9510565163},   // 18 deg
    {0.3255681545,  0.9455185756},   // 19 deg
    {0.3420201433,  0.9396926208},   // 20 deg
    {0.3583679495,  0.9335804265},   // 21 deg
    {0.3746065934,  0.9271838546},   // 22 deg
    {0.3907311285,  0.9205048535},   // 23 deg
    {0.4067366431,  0.9135454576},   // 24 deg
    {0.4226182617,  0.9063077870},   // 25 deg
    {0.4383711468,  0.8987940463},   // 26 deg
    {0.4539904997,  0.8910065242},   // 27 deg
    {0.4694715628,  0.8829475929},   // 28 deg
    {0.4848096202,  0.8746197071},   // 29 deg
    {0.5000000000,  0.8660254038},   // 30 deg
    {0.5150380749,  0.8571673007},   // 31 deg
    {0.5299192642,  0.8480480962},   // 32 deg
    {0.5446390350,  0.8386705679},   // 33 deg
    {0.5591929035,  0.8290375726},   // 34 deg
    {0.5735764364,  0.8191520443},   // 35 deg
    {0.5877852523,  0.8090169944},   // 36 deg
    {0.6018150232,  0.7986355100},   // 37 deg
    {0.6156614753,  0.7880107536},   // 38 deg
    {0.6293203910,  0.7771459615},   // 39 deg
    {0.6427876097,  0.7660444431},   // 40 deg
    {0.6560590290,  0.7547095802},   // 41 deg
    {0.6691306064,  0.7431448255},   // 42 deg
    {0.6819983601,  0.7313537016},   // 43 deg
    {0.6946583705,  0.7193398003},   // 44 deg
    {0.7071067812,  0.7071067812},   // 45 deg
    {0.7193398003,  0.6946583705},   // 46 deg
    {0.7313537016,  0.6819983601},   // 47 deg
    {0.7431448255,  0.6691306064},   // 48 deg
    {0.7547095802,  0.6560590290},   // 49 deg
    {0.7660444431,  0.6427876097},   // 50 deg
    {0.7771459615,  0.6293203910},   // 51 deg
    {0.7880107536,  0.6156614753},   // 52 deg
    {0.7986355100,  0.6018150232},   // 53 deg
    {0.8090169944,  0.5877852523},   // 54 deg
    {0.8191520443,  0.5735764364},   // 55 deg
    {0.8290375726,  0.5591929035},   // 56 deg
    {0.8386705679,  0.5446390350},   // 57 deg
    {0.8480480962,  0.5299192642},   // 58 deg
    {0.8571673007,  0.5150380749},   // 59 deg
    {0.8660254038,  0.5000000000},   // 60 deg
    {0.8746197071,  0.4848096202},   // 61 deg
    {0.8829475929,  0.4694715628},   // 62 deg
    {0.8910065242,  0.4539904997},   // 63 deg
    {0.8987940463,  0.4383711468},   // 64 deg
    {0.9063077870,  0.4226182617},   // 65 deg
    {0.9135454576,  0.4067366431},   // 66 deg
    {0.9205048535,  0.3907311285},   // 67 deg
    {0.9271838546,  0.3746065934},   // 68 deg
    {0.9335804265,  0.3583679495},   // 69 deg
    {0.9396926208,  0.3420201433},   // 70 deg
    {0.9455185756,  0.3255681545},   // 71 deg
    {0.9510565163,  0.3090169944},   // 72 deg
    {0.9563047560,  0.2923717047},   // 73 deg
    {0.9612616959,  0.2756373558},   // 74 deg
    {0.9659258263,  0.2588190451},   // 75 deg
    {0.9702957263,  0.2419218956},   // 76 deg
    {0.9743700648,  0.2249510543},   // 77 deg
    {0.9781476007,  0.2079116908},   // 78 deg
    {0.9816271834,  0.1908089954},   // 79 deg
    {0.9848077530,  0.1736481777},   // 80 deg
    {0.9876883406,  0.1564344650},   // 81 deg
    {0.9902680687,  0.1391731010},   // 82 deg
    {0.9925461516,  0.1218693434},   // 83 deg
    {0.9945218954,  0.1045284633},   // 84 deg
    {0.9961946981,  0.0871557427},   // 85 deg
    {0.9975640503,  0.0697564737},   // 86 deg
    {0.9986295348,  0.0523359562},   // 87 deg
    {0.9993908270,  0.0348994967},   // 88 deg
    {0.9998476952,  0.0174524064},   // 89 deg
    {1.0000000000,  0.0000000000},   // 90 deg
    {0.9998476952,  -0.0174524064},  // 91 deg
    {0.9993908270,  -0.0348994967},  // 92 deg
    {0.9986295348,  -0.0523359562},  // 93 deg
    {0.9975640503,  -0.0697564737},  // 94 deg
    {0.9961946981,  -0.0871557427},  // 95 deg
    {0.9945218954,  -0.1045284633},  // 96 deg
    {0.9925461516,  -0.1218693434},  // 97 deg
    {0.9902680687,  -0.1391731010},  // 98 deg
    {0.9876883406,  -0.1564344650},  // 99 deg
    {0.9848077530,  -0.1736481777},  // 100 deg
    {0.9816271834,  -0.1908089954},  // 101 deg
    {0.9781476007,  -0.2079116908},  // 102 deg
    {0.9743700648,  -0.2249510543},  // 103 deg
    {0.9702957263,  -0.2419218956},  // 104 deg
    {0.9659258263,  -0.2588190451},  // 105 deg
    {0.9612616959,  -0.2756373558},  // 106 deg
    {0.9563047560,  -0.2923717047},  // 107 deg
    {0.9510565163,  -0.3090169944},  // 108 deg
    {0.9455185756,  -0.3255681545},  // 109 deg
    {0.9396926208,  -0.3420201433},  // 110 deg
    {0.9335804265,  -0.3583679495},  // 111 deg
    {0.9271838546,  -0.3746065934},  // 112 deg
    {0.9205048535,  -0.3907311285},  // 113 deg
    {0.9135454576,  -0.4067366431},  // 114 deg
    {0.9063077870,  -0.4226182617},  // 115 deg
    {0.8987940463,  -0.4383711468},  // 116 deg
    {0.8910065242,  -0.4539904997},  // 117 deg
    {0.8829475929,  -0.4694715628},  // 118 deg
    {0.8746197071,  -0.4848096202},  // 119 deg
    {0.8660254038,  -0.5000000000},  // 120 deg
    {0.8571673007,  -0.5150380749},  // 121 deg
    {0.8480480962,  -0.5299192642},  // 122 deg
    {0.8386705679,  -0.5446390350},  // 123 deg
    {0.8290375726,  -0.5591929035},  // 124 deg
    {0.8191520443,  -0.5735764364},  // 125 deg
    {0.8090169944,  -0.5877852523},  // 126 deg
    {0.7986355100,  -0.6018150232},  // 127 deg
    {0.7880107536,  -0.6156614753},  // 128 deg
    {0.7771459615,  -0.6293203910},  // 129 deg
    {0.7660444431,  -0.6427876097},  // 130 deg
    {0.7547095802,  -0.6560590290},  // 131 deg
    {0.7431448255,  -0.6691306064},  // 132 deg
    {0.7313537016,  -0.6819983601},  // 133 deg
    {0.7193398003,  -0.6946583705},  // 134 deg
    {0.7071067812,  -0.7071067812},  // 135 deg
    {0.6946583705,  -0.7193398003},  // 136 deg
    {0.6819983601,  -0.7313537016},  // 137 deg
    {0.6691306064,  -0.7431448255},  // 138 deg
    {0.6560590290,  -0.7547095802},  // 139 deg
    {0.6427876097,  -0.7660444431},  // 140 deg
    {0.6293203910,  -0.7771459615},  // 141 deg
    {0.6156614753,  -0.7880107536},  // 142 deg
    {0.6018150232,  -0.7986355100},  // 143 deg
    {0.5877852523,  -0.8090169944},  // 144 deg
    {0.5735764364,  -0.8191520443},  // 145 deg
    {0.5591929035,  -0.8290375726},  // 146 deg
    {0.5446390350,  -0.8386705679},  // 147 deg
    {0.5299192642,  -0.8480480962},  // 148 deg
    {0.5150380749,  -0.8571673007},  // 149 deg
    {0.5000000000,  -0.8660254038},  // 150 deg
    {0.4848096202,  -0.8746197071},  // 151 deg
    {0.4694715628,  -0.8829475929},  // 152 deg
    {0.4539904997,  -0.8910065242},  // 153 deg
    {0.4383711468,  -0.8987940463},  // 154 deg
    {0.4226182617,  -0.9063077870},  // 155 deg
    {0.4067366431,  -0.9135454576},  // 156 deg
    {0.3907311285,  -0.9205048535},  // 157 deg
    {0.3746065934,  -0.9271838546},  // 158 deg
    {0.3583679495,  -0.9335804265},  // 159 deg
    {0.3420201433,  -0.9396926208},  // 160 deg
    {0.3255681545,  -0.9455185756},  // 161 deg
    {0.3090169944,  -0.9510565163},  // 162 deg
    {0.2923717047,  -0.9563047560},  // 163 deg
    {0.2756373558,  -0.9612616959},  // 164 deg
    {0.2588190451,  -0.9659258263},  // 165 deg
    {0.2419218956,  -0.9702957263},  // 166 deg
    {0.2249510543,  -0.9743700648},  // 167 deg
    {0.2079116908,  -0.9781476007},  // 168 deg
    {0.1908089954,  -0.9816271834},  // 169 deg
    {0.1736481777,  -0.9848077530},  // 170 deg
    {0.1564344650,  -0.9876883406},  // 171 deg
    {0.1391731010,  -0.9902680687},  // 172 deg
    {0.1218693434,  -0.9925461516},  // 173 deg
    {0.1045284633,  -0.9945218954},  // 174 deg
    {0.0871557427,  -0.9961946981},  // 175 deg
    {0.0697564737,  -0.9975640503},  // 176 deg
    {0.0523359562,  -0.9986295348},  // 177 deg
    {0.0348994967,  -0.9993908270},  // 178 deg
    {0.0174524064,  -0.9998476952},  // 179 deg
    {0.0000000000,  -1.0000000000},  // 180 deg
    {-0.0174524064, -0.9998476952},  // 181 deg
    {-0.0348994967, -0.9993908270},  // 182 deg
    {-0.0523359562, -0.9986295348},  // 183 deg
    {-0.0697564737, -0.9975640503},  // 184 deg
    {-0.0871557427, -0.9961946981},  // 185 deg
    {-0.1045284633, -0.9945218954},  // 186 deg
    {-0.1218693434, -0.9925461516},  // 187 deg
    {-0.1391731010, -0.9902680687},  // 188 deg
    {-0.1564344650, -0.9876883406},  // 189 deg
    {-0.1736481777, -0.9848077530},  // 190 deg
    {-0.1908089954, -0.9816271834},  // 191 deg
    {-0.2079116908, -0.9781476007},  // 192 deg
    {-0.2249510543, -0.9743700648},  // 193 deg
    {-0.2419218956, -0.9702957263},  // 194 deg
    {-0.2588190451, -0.9659258263},  // 195 deg
    {-0.2756373558, -0.9612616959},  // 196 deg
    {-0.2923717047, -0.9563047560},  // 197 deg
    {-0.3090169944, -0.9510565163},  // 198 deg
    {-0.3255681545, -0.9455185756},  // 199 deg
    {-0.3420201433, -0.9396926208},  // 200 deg
    {-0.3583679495, -0.9335804265},  // 201 deg
    {-0.3746065934, -0.9271838546},  // 202 deg
    {-0.3907311285, -0.9205048535},  // 203 deg
    {-0.4067366431, -0.9135454576},  // 204 deg
    {-0.4226182617, -0.9063077870},  // 205 deg
    {-0.4383711468, -0.8987940463},  // 206 deg
    {-0.4539904997, -0.8910065242},  // 207 deg
    {-0.4694715628, -0.8829475929},  // 208 deg
    {-0.4848096202, -0.8746197071},  // 209 deg
    {-0.5000000000, -0.8660254038},  // 210 deg
    {-0.5150380749, -0.8571673007},  // 211 deg
    {-0.5299192642, -0.8480480962},  // 212 deg
    {-0.5446390350, -0.8386705679},  // 213 deg
    {-0.5591929035, -0.8290375726},  // 214 deg
    {-0.5735764364, -0.8191520443},  // 215 deg
    {-0.5877852523, -0.8090169944},  // 216 deg
    {-0.6018150232, -0.7986355100},  // 217 deg
    {-0.6156614753, -0.7880107536},  // 218 deg
    {-0.6293203910, -0.7771459615},  // 219 deg
    {-0.6427876097, -0.7660444431},  // 220 deg
    {-0.6560590290, -0.7547095802},  // 221 deg
    {-0.6691306064, -0.7431448255},  // 222 deg
    {-0.6819983601, -0.7313537016},  // 223 deg
    {-0.6946583705, -0.7193398003},  // 224 deg
    {-0.7071067812, -0.7071067812},  // 225 deg
    {-0.7193398003, -0.6946583705},  // 226 deg
    {-0.7313537016, -0.6819983601},  // 227 deg
    {-0.7431448255, -0.6691306064},  // 228 deg
    {-0.7547095802, -0.6560590290},  // 229 deg
    {-0.7660444431, -0.6427876097},  // 230 deg
    {-0.7771459615, -0.6293203910},  // 231 deg
    {-0.7880107536, -0.6156614753},  // 232 deg
    {-0.7986355100, -0.6018150232},  // 233 deg
    {-0.8090169944, -0.5877852523},  // 234 deg
    {-0.8191520443, -0.5735764364},  // 235 deg
    {-0.8290375726, -0.5591929035},  // 236 deg
    {-0.8386705679, -0.5446390350},  // 237 deg
    {-0.8480480962, -0.5299192642},  // 238 deg
    {-0.8571673007, -0.5150380749},  // 239 deg
    {-0.8660254038, -0.5000000000},  // 240 deg
    {-0.8746197071, -0.4848096202},  // 241 deg
    {-0.8829475929, -0.4694715628},  // 242 deg
    {-0.8910065242, -0.4539904997},  // 243 deg
    {-0.8987940463, -0.4383711468},  // 244 deg
    {-0.9063077870, -0.4226182617},  // 245 deg
    {-0.9135454576, -0.4067366431},  // 246 deg
    {-0.9205048535, -0.3907311285},  // 247 deg
    {-0.9271838546, -0.3746065934},  // 248 deg
    {-0.9335804265, -0.3583679495},  // 249 deg
    {-0.9396926208, -0.3420201433},  // 250 deg
    {-0.9455185756, -0.3255681545},  // 251 deg
    {-0.9510565163, -0.3090169944},  // 252 deg
    {-0.9563047560, -0.2923717047},  // 253 deg
    {-0.9612616959, -0.2756373558},  // 254 deg
    {-0.9659258263, -0.2588190451},  // 255 deg
    {-0.9702957263, -0.2419218956},  // 256 deg
    {-0.9743700648, -0.2249510543},  // 257 deg
    {-0.9781476007, -0.2079116908},  // 258 deg
    {-0.9816271834, -0.1908089954},  // 259 deg
    {-0.9848077530, -0.1736481777},  // 260 deg
    {-0.9876883406, -0.1564344650},  // 261 deg
    {-0.9902680687, -0.1391731010},  // 262 deg
    {-0.9925461516, -0.1218693434},  // 263 deg
    {-0.9945218954, -0.1045284633},  // 264 deg
    {-0.9961946981, -0.0871557427},  // 265 deg
    {-0.9975640503, -0.0697564737},  // 266 deg
    {-0.9986295348, -0.0523359562},  // 267 deg
    {-0.9993908270, -0.0348994967},  // 268 deg
    {-0.9998476952, -0.0174524064},  // 269 deg
    {-1.0000000000, -0.0000000000},  // 270 deg
    {-0.9998476952, 0.0174524064},  // 271 deg
    {-0.9993908270, 0.0348994967},  // 272 deg
    {-0.9986295348, 0.0523359562},  // 273 deg
    {-0.9975640503, 0.0697564737},  // 274 deg
    {-0.9961946981, 0.0871557427},  // 275 deg
    {-0.9945218954, 0.1045284633},  // 276 deg
    {-0.9925461516, 0.1218693434},  // 277 deg
    {-0.9902680687, 0.1391731010},  // 278 deg
    {-0.9876883406, 0.1564344650},  // 279 deg
    {-0.9848077530, 0.1736481777},  // 280 deg
    {-0.9816271834, 0.1908089954},  // 281 deg
    {-0.9781476007, 0.2079116908},  // 282 deg
    {-0.9743700648, 0.2249510543},  // 283 deg
    {-0.9702957263, 0.2419218956},  // 284 deg
    {-0.9659258263, 0.2588190451},  // 285 deg
    {-0.9612616959, 0.2756373558},  // 286 deg
    {-0.9563047560, 0.2923717047},  // 287 deg
    {-0.9510565163, 0.3090169944},  // 288 deg
    {-0.9455185756, 0.3255681545},  // 289 deg
    {-0.9396926208, 0.3420201433},  // 290 deg
    {-0.9335804265, 0.3583679495},  // 291 deg
    {-0.9271838546, 0.3746065934},  // 292 deg
    {-0.9205048535, 0.3907311285},  // 293 deg
    {-0.9135454576, 0.4067366431},  // 294 deg
    {-0.9063077870, 0.4226182617},  // 295 deg
    {-0.8987940463, 0.4383711468},  // 296 deg
    {-0.8910065242, 0.4539904997},  // 297 deg
    {-0.8829475929, 0.4694715628},  // 298 deg
    {-0.8746197071, 0.4848096202},  // 299 deg
    {-0.8660254038, 0.5000000000},  // 300 deg
    {-0.8571673007, 0.5150380749},  // 301 deg
    {-0.8480480962, 0.5299192642},  // 302 deg
    {-0.8386705679, 0.5446390350},  // 303 deg
    {-0.8290375726, 0.5591929035},  // 304 deg
    {-0.8191520443, 0.5735764364},  // 305 deg
    {-0.8090169944, 0.5877852523},  // 306 deg
    {-0.7986355100, 0.6018150232},  // 307 deg
    {-0.7880107536, 0.6156614753},  // 308 deg
    {-0.7771459615, 0.6293203910},  // 309 deg
    {-0.7660444431, 0.6427876097},  // 310 deg
    {-0.7547095802, 0.6560590290},  // 311 deg
    {-0.7431448255, 0.6691306064},  // 312 deg
    {-0.7313537016, 0.6819983601},  // 313 deg
    {-0.7193398003, 0.6946583705},  // 314 deg
    {-0.7071067812, 0.7071067812},  // 315 deg
    {-0.6946583705, 0.7193398003},  // 316 deg
    {-0.6819983601, 0.7313537016},  // 317 deg
    {-0.6691306064, 0.7431448255},  // 318 deg
    {-0.6560590290, 0.7547095802},  // 319 deg
    {-0.6427876097, 0.7660444431},  // 320 deg
    {-0.6293203910, 0.7771459615},  // 321 deg
    {-0.6156614753, 0.7880107536},  // 322 deg
    {-0.6018150232, 0.7986355100},  // 323 deg
    {-0.5877852523, 0.8090169944},  // 324 deg
    {-0.5735764364, 0.8191520443},  // 325 deg
    {-0.5591929035, 0.8290375726},  // 326 deg
    {-0.5446390350, 0.8386705679},  // 327 deg
    {-0.5299192642, 0.8480480962},  // 328 deg
    {-0.5150380749, 0.8571673007},  // 329 deg
    {-0.5000000000, 0.8660254038},  // 330 deg
    {-0.4848096202, 0.8746197071},  // 331 deg
    {-0.4694715628, 0.8829475929},  // 332 deg
    {-0.4539904997, 0.8910065242},  // 333 deg
    {-0.4383711468, 0.8987940463},  // 334 deg
    {-0.4226182617, 0.9063077870},  // 335 deg
    {-0.4067366431, 0.9135454576},  // 336 deg
    {-0.3907311285, 0.9205048535},  // 337 deg
    {-0.3746065934, 0.9271838546},  // 338 deg
    {-0.3583679495, 0.9335804265},  // 339 deg
    {-0.3420201433, 0.9396926208},  // 340 deg
    {-0.3255681545, 0.9455185756},  // 341 deg
    {-0.3090169944, 0.9510565163},  // 342 deg
    {-0.2923717047, 0.9563047560},  // 343 deg
    {-0.2756373558, 0.9612616959},  // 344 deg
    {-0.2588190451, 0.9659258263},  // 345 deg
    {-0.2419218956, 0.9702957263},  // 346 deg
    {-0.2249510543, 0.9743700648},  // 347 deg
    {-0.2079116908, 0.9781476007},  // 348 deg
    {-0.1908089954, 0.9816271834},  // 349 deg
    {-0.1736481777, 0.9848077530},  // 350 deg
    {-0.1564344650, 0.9876883406},  // 351 deg
    {-0.1391731010, 0.9902680687},  // 352 deg
    {-0.1218693434, 0.9925461516},  // 353 deg
    {-0.1045284633, 0.9945218954},  // 354 deg
    {-0.0871557427, 0.9961946981},  // 355 deg
    {-0.0697564737, 0.9975640503},  // 356 deg
    {-0.0523359562, 0.9986295348},  // 357 deg
    {-0.0348994967, 0.9993908270},  // 358 deg
    {-0.0174524064, 0.9998476952},  // 359 deg
    {-0.0000000000, 1.0000000000},  // 360 deg
};

static float bit_pattern_31_x_[512] = {
    8, 9, 4, 7, -11, -8, 7, 12, 2, 2, 1, 1, -2, -2, -13, -11,
    -13, -12, 10, 11, -13, -8, -11, -9, 7, 12, -4, -3, -13, -12, -9, -7,
    12, 12, -3, -2, -6, -4, 11, 12, 4, 5, 5, 10, 3, 6, -8, -6,
    -2, -1, -13, -8, -7, -5, -4, -3, -10, -6, 5, 6, 5, 7, 1, 4,
    9, 11, 4, 4, 2, 4, -4, -2, -8, -7, 4, 9, 0, 1, -13, -8,
    -3, -2, -6, -4, 8, 10, 0, 1, 7, 11, -13, -11, 10, 12, -6, -6,
    10, 12, -13, -8, -13, -8, 3, 7, 5, 10, -1, 1, 3, 5, 2, 3,
    -13, -13, -13, -12, -13, -11, -7, -4, 6, 12, -9, -7, -2, 0, -12, -7,
    3, 8, -7, -4, -3, -1, 2, 5, -11, -5, -1, 0, 5, 5, -4, -4,
    -9, -9, -12, -8, 10, 12, 7, 12, -7, -6, -4, -3, 7, 12, -7, -5,
    -13, -12, -3, -2, 7, 12, -13, -11, 1, 12, 2, 3, -4, -2, -1, 1,
    7, 8, 1, 3, 9, 12, -1, -1, -13, -10, 7, 10, 12, 12, 6, 7,
    5, 6, 2, 2, 3, 4, 2, 12, 9, 10, -8, -7, -11, -4, 1, 2,
    6, 7, 2, 3, 6, 11, 3, 8, 7, 9, -11, -6, -10, -5, -5, -3,
    -10, -9, 8, 12, 4, 6, -10, -8, 4, 6, -2, -2, -5, -5, 7, 10,
    -9, -8, -5, -5, 8, 9, -9, -9, 1, 1, 7, 9, -2, -1, 11, 12,
    -12, -6, 3, 7, 5, 10, 0, 2, -9, -5, 0, 2, -1, 1, 5, 7,
    3, 6, -13, -8, -5, -3, -4, -3, 6, 8, -7, -6, -13, -5, 1, 3,
    4, 8, -2, 2, 2, 12, -2, 0, 4, 9, -6, -3, -3, -1, 7, 12,
    4, 5, -13, -9, 7, 8, 7, 7, -7, -7, -8, -7, -13, -12, 2, 3,
    10, 12, -6, -6, 8, 9, 2, 2, -11, -10, -12, -7, -11, -10, 5, 11,
    -2, -1, -1, 0, -13, -12, -10, -10, -3, -2, 2, 3, -9, -4, -4, -3,
    -4, -2, -6, -4, 6, 6, -13, -5, 11, 12, 7, 12, -1, 0, -4, -3,
    -7, -6, -13, -8, -7, -6, -8, -6, -5, -4, -13, -8, 1, 5, 1, 10,
    9, 10, 5, 10, -1, 1, -9, -6, -1, 1, -13, -8, 8, 10, 2, 3,
    7, 12, -10, -5, -10, -8, 4, 8, 3, 8, -4, -3, 5, 10, 4, 5,
    -9, -4, 0, 3, -12, -6, 3, 4, -10, -10, 8, 12, -8, -6, 2, 3,
    10, 11, 6, 8, -7, -6, -3, -3, -1, -1, -3, -3, -8, -8, 4, 12,
    2, 3, 6, 11, 3, 7, 11, 12, -3, -3, 4, 4, 2, 2, -10, -8,
    -13, -11, -13, -11, 6, 11, 0, 1, -13, -9, -9, -6, -13, -8, 5, 8,
    2, 3, -1, -1, 9, 11, 11, 12, 3, 3, -1, 0, 3, 4, -13, -10,
    5, 12, 8, 9, 7, 8, -10, -10, 7, 12, 9, 10, 7, 12, -1, 0
};

static float bit_pattern_31_y_[512] = {
    -3, 5, 2, -12, 9, 2, -12, -13, -13, 12, -7, 6, -10, -4, -13, -8,
    -3, -9, 4, 9, -8, -9, 7, 12, 7, 6, -5, 0, 2, -3, 0, 5,
    -6, -1, 6, 12, -13, -8, -13, -8, 7, 1, -3, -3, -7, 12, -7, -2,
    11, -10, 12, 10, 3, -3, 2, 7, -12, 11, -12, -7, -6, -1, 0, -5,
    11, -13, 7, 12, -1, 4, -12, 7, -5, -10, 11, 12, -8, -13, -2, 2,
    -2, 3, 9, -9, 12, 7, 9, 3, -5, -10, -6, 0, 7, 1, -3, 12,
    -9, -4, 8, -12, 0, -4, 3, 8, 7, -7, 7, -12, -10, 6, -4, -10,
    0, 5, -7, 12, 3, 8, 12, 7, -10, 8, -1, -6, -5, 12, 5, 5,
    -10, -13, -7, 5, -2, -7, 9, -11, -13, -13, 6, -1, -3, 2, -13, 12,
    -6, 6, -10, -4, 2, -3, 12, 12, -13, 5, 9, 4, -1, 2, 6, 1,
    11, 5, 7, -6, -8, -7, -7, -12, -3, 12, -6, 0, 3, -13, -13, 9,
    1, -6, -1, 12, 1, 6, -9, 3, -13, 5, 7, 12, -5, 9, 3, 11,
    -13, 10, -12, 3, 8, -6, 6, -13, -12, 3, 4, 9, 12, -6, 12, -8,
    -9, -4, 3, -2, 3, 0, -3, -8, 8, 3, -5, -4, 11, 10, -8, 12,
    5, 0, -1, -6, -6, -11, 12, 7, -2, 7, 0, 12, -8, 2, -6, 12,
    -13, -8, -13, -2, -8, -13, -11, 0, -8, -2, -4, 1, 1, -4, -6, -11,
    -9, 4, 7, 12, 5, 8, -4, 8, 12, -13, 7, 12, 2, 7, 11, -9,
    5, -8, -4, 9, 9, -3, -7, -12, 5, 0, 6, 12, 6, -2, -10, 10,
    1, -4, -2, -13, -12, 12, -13, -6, 1, 3, -10, -5, -13, 1, 5, -11,
    -2, -7, 9, -5, 1, 6, -8, 6, -4, 1, 11, -8, 6, -8, 4, 9,
    -5, 3, -5, 7, -3, -8, -12, 8, -2, 3, -13, -9, 0, -5, -3, 8,
    -13, 12, -8, 9, -11, -5, -2, 11, 9, -13, -3, 2, -13, 0, 6, -10,
    12, -7, -11, 9, -3, 11, 11, 5, 11, 6, -5, -2, 12, 7, -8, -2,
    1, 7, -12, -13, -2, -8, 5, -9, -1, 5, 7, 10, 5, -13, 0, -13,
    12, -1, -8, -9, 11, -13, -3, 2, -10, 12, 1, -10, -11, -6, -13, -6,
    -13, -9, -10, -7, -8, -13, -6, 5, 12, -13, 2, -3, -13, -12, -13, -1,
    9, 3, 3, -9, 1, 1, 2, -8, -10, 9, -13, 12, -12, -5, 2, 7,
    6, -8, 8, -12, 10, 5, -9, 9, -13, 5, -7, 4, -2, 3, 2, 12,
    -5, 11, -9, -13, -1, 12, -1, 4, 0, 6, -11, 12, -4, 1, -6, 1,
    7, 1, 12, -13, 0, -13, -1, 4, 3, -2, 8, -3, -6, -2, -9, 10,
    7, -9, -6, -1, 5, -2, -3, -8, 0, 5, 4, 10, -6, 5, 0, 5,
    8, 11, 9, -6, -4, -12, 4, 9, 3, 4, -7, -2, 0, -2, -6, -11
};

static int bit_pattern_31_[256*4] = {
    8, -3, 9, 5/*mean (0), correlation (0)*/,
    4, 2, 7, -12/*mean (1.12461e-05), correlation (0.0437584)*/,
    -11, 9, -8, 2/*mean (3.37382e-05), correlation (0.0617409)*/,
    7, -12, 12, -13/*mean (5.62303e-05), correlation (0.0636977)*/,
    2, -13, 2, 12/*mean (0.000134953), correlation (0.085099)*/,
    1, -7, 1, 6/*mean (0.000528565), correlation (0.0857175)*/,
    -2, -10, -2, -4/*mean (0.0188821), correlation (0.0985774)*/,
    -13, -13, -11, -8/*mean (0.0363135), correlation (0.0899616)*/,
    -13, -3, -12, -9/*mean (0.121806), correlation (0.099849)*/,
    10, 4, 11, 9/*mean (0.122065), correlation (0.093285)*/,
    -13, -8, -8, -9/*mean (0.162787), correlation (0.0942748)*/,
    -11, 7, -9, 12/*mean (0.21561), correlation (0.0974438)*/,
    7, 7, 12, 6/*mean (0.160583), correlation (0.130064)*/,
    -4, -5, -3, 0/*mean (0.228171), correlation (0.132998)*/,
    -13, 2, -12, -3/*mean (0.00997526), correlation (0.145926)*/,
    -9, 0, -7, 5/*mean (0.198234), correlation (0.143636)*/,
    12, -6, 12, -1/*mean (0.0676226), correlation (0.16689)*/,
    -3, 6, -2, 12/*mean (0.166847), correlation (0.171682)*/,
    -6, -13, -4, -8/*mean (0.101215), correlation (0.179716)*/,
    11, -13, 12, -8/*mean (0.200641), correlation (0.192279)*/,
    4, 7, 5, 1/*mean (0.205106), correlation (0.186848)*/,
    5, -3, 10, -3/*mean (0.234908), correlation (0.192319)*/,
    3, -7, 6, 12/*mean (0.0709964), correlation (0.210872)*/,
    -8, -7, -6, -2/*mean (0.0939834), correlation (0.212589)*/,
    -2, 11, -1, -10/*mean (0.127778), correlation (0.20866)*/,
    -13, 12, -8, 10/*mean (0.14783), correlation (0.206356)*/,
    -7, 3, -5, -3/*mean (0.182141), correlation (0.198942)*/,
    -4, 2, -3, 7/*mean (0.188237), correlation (0.21384)*/,
    -10, -12, -6, 11/*mean (0.14865), correlation (0.23571)*/,
    5, -12, 6, -7/*mean (0.222312), correlation (0.23324)*/,
    5, -6, 7, -1/*mean (0.229082), correlation (0.23389)*/,
    1, 0, 4, -5/*mean (0.241577), correlation (0.215286)*/,
    9, 11, 11, -13/*mean (0.00338507), correlation (0.251373)*/,
    4, 7, 4, 12/*mean (0.131005), correlation (0.257622)*/,
    2, -1, 4, 4/*mean (0.152755), correlation (0.255205)*/,
    -4, -12, -2, 7/*mean (0.182771), correlation (0.244867)*/,
    -8, -5, -7, -10/*mean (0.186898), correlation (0.23901)*/,
    4, 11, 9, 12/*mean (0.226226), correlation (0.258255)*/,
    0, -8, 1, -13/*mean (0.0897886), correlation (0.274827)*/,
    -13, -2, -8, 2/*mean (0.148774), correlation (0.28065)*/,
    -3, -2, -2, 3/*mean (0.153048), correlation (0.283063)*/,
    -6, 9, -4, -9/*mean (0.169523), correlation (0.278248)*/,
    8, 12, 10, 7/*mean (0.225337), correlation (0.282851)*/,
    0, 9, 1, 3/*mean (0.226687), correlation (0.278734)*/,
    7, -5, 11, -10/*mean (0.00693882), correlation (0.305161)*/,
    -13, -6, -11, 0/*mean (0.0227283), correlation (0.300181)*/,
    10, 7, 12, 1/*mean (0.125517), correlation (0.31089)*/,
    -6, -3, -6, 12/*mean (0.131748), correlation (0.312779)*/,
    10, -9, 12, -4/*mean (0.144827), correlation (0.292797)*/,
    -13, 8, -8, -12/*mean (0.149202), correlation (0.308918)*/,
    -13, 0, -8, -4/*mean (0.160909), correlation (0.310013)*/,
    3, 3, 7, 8/*mean (0.177755), correlation (0.309394)*/,
    5, 7, 10, -7/*mean (0.212337), correlation (0.310315)*/,
    -1, 7, 1, -12/*mean (0.214429), correlation (0.311933)*/,
    3, -10, 5, 6/*mean (0.235807), correlation (0.313104)*/,
    2, -4, 3, -10/*mean (0.00494827), correlation (0.344948)*/,
    -13, 0, -13, 5/*mean (0.0549145), correlation (0.344675)*/,
    -13, -7, -12, 12/*mean (0.103385), correlation (0.342715)*/,
    -13, 3, -11, 8/*mean (0.134222), correlation (0.322922)*/,
    -7, 12, -4, 7/*mean (0.153284), correlation (0.337061)*/,
    6, -10, 12, 8/*mean (0.154881), correlation (0.329257)*/,
    -9, -1, -7, -6/*mean (0.200967), correlation (0.33312)*/,
    -2, -5, 0, 12/*mean (0.201518), correlation (0.340635)*/,
    -12, 5, -7, 5/*mean (0.207805), correlation (0.335631)*/,
    3, -10, 8, -13/*mean (0.224438), correlation (0.34504)*/,
    -7, -7, -4, 5/*mean (0.239361), correlation (0.338053)*/,
    -3, -2, -1, -7/*mean (0.240744), correlation (0.344322)*/,
    2, 9, 5, -11/*mean (0.242949), correlation (0.34145)*/,
    -11, -13, -5, -13/*mean (0.244028), correlation (0.336861)*/,
    -1, 6, 0, -1/*mean (0.247571), correlation (0.343684)*/,
    5, -3, 5, 2/*mean (0.000697256), correlation (0.357265)*/,
    -4, -13, -4, 12/*mean (0.00213675), correlation (0.373827)*/,
    -9, -6, -9, 6/*mean (0.0126856), correlation (0.373938)*/,
    -12, -10, -8, -4/*mean (0.0152497), correlation (0.364237)*/,
    10, 2, 12, -3/*mean (0.0299933), correlation (0.345292)*/,
    7, 12, 12, 12/*mean (0.0307242), correlation (0.366299)*/,
    -7, -13, -6, 5/*mean (0.0534975), correlation (0.368357)*/,
    -4, 9, -3, 4/*mean (0.099865), correlation (0.372276)*/,
    7, -1, 12, 2/*mean (0.117083), correlation (0.364529)*/,
    -7, 6, -5, 1/*mean (0.126125), correlation (0.369606)*/,
    -13, 11, -12, 5/*mean (0.130364), correlation (0.358502)*/,
    -3, 7, -2, -6/*mean (0.131691), correlation (0.375531)*/,
    7, -8, 12, -7/*mean (0.160166), correlation (0.379508)*/,
    -13, -7, -11, -12/*mean (0.167848), correlation (0.353343)*/,
    1, -3, 12, 12/*mean (0.183378), correlation (0.371916)*/,
    2, -6, 3, 0/*mean (0.228711), correlation (0.371761)*/,
    -4, 3, -2, -13/*mean (0.247211), correlation (0.364063)*/,
    -1, -13, 1, 9/*mean (0.249325), correlation (0.378139)*/,
    7, 1, 8, -6/*mean (0.000652272), correlation (0.411682)*/,
    1, -1, 3, 12/*mean (0.00248538), correlation (0.392988)*/,
    9, 1, 12, 6/*mean (0.0206815), correlation (0.386106)*/,
    -1, -9, -1, 3/*mean (0.0364485), correlation (0.410752)*/,
    -13, -13, -10, 5/*mean (0.0376068), correlation (0.398374)*/,
    7, 7, 10, 12/*mean (0.0424202), correlation (0.405663)*/,
    12, -5, 12, 9/*mean (0.0942645), correlation (0.410422)*/,
    6, 3, 7, 11/*mean (0.1074), correlation (0.413224)*/,
    5, -13, 6, 10/*mean (0.109256), correlation (0.408646)*/,
    2, -12, 2, 3/*mean (0.131691), correlation (0.416076)*/,
    3, 8, 4, -6/*mean (0.165081), correlation (0.417569)*/,
    2, 6, 12, -13/*mean (0.171874), correlation (0.408471)*/,
    9, -12, 10, 3/*mean (0.175146), correlation (0.41296)*/,
    -8, 4, -7, 9/*mean (0.183682), correlation (0.402956)*/,
    -11, 12, -4, -6/*mean (0.184672), correlation (0.416125)*/,
    1, 12, 2, -8/*mean (0.191487), correlation (0.386696)*/,
    6, -9, 7, -4/*mean (0.192668), correlation (0.394771)*/,
    2, 3, 3, -2/*mean (0.200157), correlation (0.408303)*/,
    6, 3, 11, 0/*mean (0.204588), correlation (0.411762)*/,
    3, -3, 8, -8/*mean (0.205904), correlation (0.416294)*/,
    7, 8, 9, 3/*mean (0.213237), correlation (0.409306)*/,
    -11, -5, -6, -4/*mean (0.243444), correlation (0.395069)*/,
    -10, 11, -5, 10/*mean (0.247672), correlation (0.413392)*/,
    -5, -8, -3, 12/*mean (0.24774), correlation (0.411416)*/,
    -10, 5, -9, 0/*mean (0.00213675), correlation (0.454003)*/,
    8, -1, 12, -6/*mean (0.0293635), correlation (0.455368)*/,
    4, -6, 6, -11/*mean (0.0404971), correlation (0.457393)*/,
    -10, 12, -8, 7/*mean (0.0481107), correlation (0.448364)*/,
    4, -2, 6, 7/*mean (0.050641), correlation (0.455019)*/,
    -2, 0, -2, 12/*mean (0.0525978), correlation (0.44338)*/,
    -5, -8, -5, 2/*mean (0.0629667), correlation (0.457096)*/,
    7, -6, 10, 12/*mean (0.0653846), correlation (0.445623)*/,
    -9, -13, -8, -8/*mean (0.0858749), correlation (0.449789)*/,
    -5, -13, -5, -2/*mean (0.122402), correlation (0.450201)*/,
    8, -8, 9, -13/*mean (0.125416), correlation (0.453224)*/,
    -9, -11, -9, 0/*mean (0.130128), correlation (0.458724)*/,
    1, -8, 1, -2/*mean (0.132467), correlation (0.440133)*/,
    7, -4, 9, 1/*mean (0.132692), correlation (0.454)*/,
    -2, 1, -1, -4/*mean (0.135695), correlation (0.455739)*/,
    11, -6, 12, -11/*mean (0.142904), correlation (0.446114)*/,
    -12, -9, -6, 4/*mean (0.146165), correlation (0.451473)*/,
    3, 7, 7, 12/*mean (0.147627), correlation (0.456643)*/,
    5, 5, 10, 8/*mean (0.152901), correlation (0.455036)*/,
    0, -4, 2, 8/*mean (0.167083), correlation (0.459315)*/,
    -9, 12, -5, -13/*mean (0.173234), correlation (0.454706)*/,
    0, 7, 2, 12/*mean (0.18312), correlation (0.433855)*/,
    -1, 2, 1, 7/*mean (0.185504), correlation (0.443838)*/,
    5, 11, 7, -9/*mean (0.185706), correlation (0.451123)*/,
    3, 5, 6, -8/*mean (0.188968), correlation (0.455808)*/,
    -13, -4, -8, 9/*mean (0.191667), correlation (0.459128)*/,
    -5, 9, -3, -3/*mean (0.193196), correlation (0.458364)*/,
    -4, -7, -3, -12/*mean (0.196536), correlation (0.455782)*/,
    6, 5, 8, 0/*mean (0.1972), correlation (0.450481)*/,
    -7, 6, -6, 12/*mean (0.199438), correlation (0.458156)*/,
    -13, 6, -5, -2/*mean (0.211224), correlation (0.449548)*/,
    1, -10, 3, 10/*mean (0.211718), correlation (0.440606)*/,
    4, 1, 8, -4/*mean (0.213034), correlation (0.443177)*/,
    -2, -2, 2, -13/*mean (0.234334), correlation (0.455304)*/,
    2, -12, 12, 12/*mean (0.235684), correlation (0.443436)*/,
    -2, -13, 0, -6/*mean (0.237674), correlation (0.452525)*/,
    4, 1, 9, 3/*mean (0.23962), correlation (0.444824)*/,
    -6, -10, -3, -5/*mean (0.248459), correlation (0.439621)*/,
    -3, -13, -1, 1/*mean (0.249505), correlation (0.456666)*/,
    7, 5, 12, -11/*mean (0.00119208), correlation (0.495466)*/,
    4, -2, 5, -7/*mean (0.00372245), correlation (0.484214)*/,
    -13, 9, -9, -5/*mean (0.00741116), correlation (0.499854)*/,
    7, 1, 8, 6/*mean (0.0208952), correlation (0.499773)*/,
    7, -8, 7, 6/*mean (0.0220085), correlation (0.501609)*/,
    -7, -4, -7, 1/*mean (0.0233806), correlation (0.496568)*/,
    -8, 11, -7, -8/*mean (0.0236505), correlation (0.489719)*/,
    -13, 6, -12, -8/*mean (0.0268781), correlation (0.503487)*/,
    2, 4, 3, 9/*mean (0.0323324), correlation (0.501938)*/,
    10, -5, 12, 3/*mean (0.0399235), correlation (0.494029)*/,
    -6, -5, -6, 7/*mean (0.0420153), correlation (0.486579)*/,
    8, -3, 9, -8/*mean (0.0548021), correlation (0.484237)*/,
    2, -12, 2, 8/*mean (0.0616622), correlation (0.496642)*/,
    -11, -2, -10, 3/*mean (0.0627755), correlation (0.498563)*/,
    -12, -13, -7, -9/*mean (0.0829622), correlation (0.495491)*/,
    -11, 0, -10, -5/*mean (0.0843342), correlation (0.487146)*/,
    5, -3, 11, 8/*mean (0.0929937), correlation (0.502315)*/,
    -2, -13, -1, 12/*mean (0.113327), correlation (0.48941)*/,
    -1, -8, 0, 9/*mean (0.132119), correlation (0.467268)*/,
    -13, -11, -12, -5/*mean (0.136269), correlation (0.498771)*/,
    -10, -2, -10, 11/*mean (0.142173), correlation (0.498714)*/,
    -3, 9, -2, -13/*mean (0.144141), correlation (0.491973)*/,
    2, -3, 3, 2/*mean (0.14892), correlation (0.500782)*/,
    -9, -13, -4, 0/*mean (0.150371), correlation (0.498211)*/,
    -4, 6, -3, -10/*mean (0.152159), correlation (0.495547)*/,
    -4, 12, -2, -7/*mean (0.156152), correlation (0.496925)*/,
    -6, -11, -4, 9/*mean (0.15749), correlation (0.499222)*/,
    6, -3, 6, 11/*mean (0.159211), correlation (0.503821)*/,
    -13, 11, -5, 5/*mean (0.162427), correlation (0.501907)*/,
    11, 11, 12, 6/*mean (0.16652), correlation (0.497632)*/,
    7, -5, 12, -2/*mean (0.169141), correlation (0.484474)*/,
    -1, 12, 0, 7/*mean (0.169456), correlation (0.495339)*/,
    -4, -8, -3, -2/*mean (0.171457), correlation (0.487251)*/,
    -7, 1, -6, 7/*mean (0.175), correlation (0.500024)*/,
    -13, -12, -8, -13/*mean (0.175866), correlation (0.497523)*/,
    -7, -2, -6, -8/*mean (0.178273), correlation (0.501854)*/,
    -8, 5, -6, -9/*mean (0.181107), correlation (0.494888)*/,
    -5, -1, -4, 5/*mean (0.190227), correlation (0.482557)*/,
    -13, 7, -8, 10/*mean (0.196739), correlation (0.496503)*/,
    1, 5, 5, -13/*mean (0.19973), correlation (0.499759)*/,
    1, 0, 10, -13/*mean (0.204465), correlation (0.49873)*/,
    9, 12, 10, -1/*mean (0.209334), correlation (0.49063)*/,
    5, -8, 10, -9/*mean (0.211134), correlation (0.503011)*/,
    -1, 11, 1, -13/*mean (0.212), correlation (0.499414)*/,
    -9, -3, -6, 2/*mean (0.212168), correlation (0.480739)*/,
    -1, -10, 1, 12/*mean (0.212731), correlation (0.502523)*/,
    -13, 1, -8, -10/*mean (0.21327), correlation (0.489786)*/,
    8, -11, 10, -6/*mean (0.214159), correlation (0.488246)*/,
    2, -13, 3, -6/*mean (0.216993), correlation (0.50287)*/,
    7, -13, 12, -9/*mean (0.223639), correlation (0.470502)*/,
    -10, -10, -5, -7/*mean (0.224089), correlation (0.500852)*/,
    -10, -8, -8, -13/*mean (0.228666), correlation (0.502629)*/,
    4, -6, 8, 5/*mean (0.22906), correlation (0.498305)*/,
    3, 12, 8, -13/*mean (0.233378), correlation (0.503825)*/,
    -4, 2, -3, -3/*mean (0.234323), correlation (0.476692)*/,
    5, -13, 10, -12/*mean (0.236392), correlation (0.475462)*/,
    4, -13, 5, -1/*mean (0.236842), correlation (0.504132)*/,
    -9, 9, -4, 3/*mean (0.236977), correlation (0.497739)*/,
    0, 3, 3, -9/*mean (0.24314), correlation (0.499398)*/,
    -12, 1, -6, 1/*mean (0.243297), correlation (0.489447)*/,
    3, 2, 4, -8/*mean (0.00155196), correlation (0.553496)*/,
    -10, -10, -10, 9/*mean (0.00239541), correlation (0.54297)*/,
    8, -13, 12, 12/*mean (0.0034413), correlation (0.544361)*/,
    -8, -12, -6, -5/*mean (0.003565), correlation (0.551225)*/,
    2, 2, 3, 7/*mean (0.00835583), correlation (0.55285)*/,
    10, 6, 11, -8/*mean (0.00885065), correlation (0.540913)*/,
    6, 8, 8, -12/*mean (0.0101552), correlation (0.551085)*/,
    -7, 10, -6, 5/*mean (0.0102227), correlation (0.533635)*/,
    -3, -9, -3, 9/*mean (0.0110211), correlation (0.543121)*/,
    -1, -13, -1, 5/*mean (0.0113473), correlation (0.550173)*/,
    -3, -7, -3, 4/*mean (0.0140913), correlation (0.554774)*/,
    -8, -2, -8, 3/*mean (0.017049), correlation (0.55461)*/,
    4, 2, 12, 12/*mean (0.01778), correlation (0.546921)*/,
    2, -5, 3, 11/*mean (0.0224022), correlation (0.549667)*/,
    6, -9, 11, -13/*mean (0.029161), correlation (0.546295)*/,
    3, -1, 7, 12/*mean (0.0303081), correlation (0.548599)*/,
    11, -1, 12, 4/*mean (0.0355151), correlation (0.523943)*/,
    -3, 0, -3, 6/*mean (0.0417904), correlation (0.543395)*/,
    4, -11, 4, 12/*mean (0.0487292), correlation (0.542818)*/,
    2, -4, 2, 1/*mean (0.0575124), correlation (0.554888)*/,
    -10, -6, -8, 1/*mean (0.0594242), correlation (0.544026)*/,
    -13, 7, -11, 1/*mean (0.0597391), correlation (0.550524)*/,
    -13, 12, -11, -13/*mean (0.0608974), correlation (0.55383)*/,
    6, 0, 11, -13/*mean (0.065126), correlation (0.552006)*/,
    0, -1, 1, 4/*mean (0.074224), correlation (0.546372)*/,
    -13, 3, -9, -2/*mean (0.0808592), correlation (0.554875)*/,
    -9, 8, -6, -3/*mean (0.0883378), correlation (0.551178)*/,
    -13, -6, -8, -2/*mean (0.0901035), correlation (0.548446)*/,
    5, -9, 8, 10/*mean (0.0949843), correlation (0.554694)*/,
    2, 7, 3, -9/*mean (0.0994152), correlation (0.550979)*/,
    -1, -6, -1, -1/*mean (0.10045), correlation (0.552714)*/,
    9, 5, 11, -2/*mean (0.100686), correlation (0.552594)*/,
    11, -3, 12, -8/*mean (0.101091), correlation (0.532394)*/,
    3, 0, 3, 5/*mean (0.101147), correlation (0.525576)*/,
    -1, 4, 0, 10/*mean (0.105263), correlation (0.531498)*/,
    3, -6, 4, 5/*mean (0.110785), correlation (0.540491)*/,
    -13, 0, -10, 5/*mean (0.112798), correlation (0.536582)*/,
    5, 8, 12, 11/*mean (0.114181), correlation (0.555793)*/,
    8, 9, 9, -6/*mean (0.117431), correlation (0.553763)*/,
    7, -4, 8, -12/*mean (0.118522), correlation (0.553452)*/,
    -10, 4, -10, 9/*mean (0.12094), correlation (0.554785)*/,
    7, 3, 12, 4/*mean (0.122582), correlation (0.555825)*/,
    9, -7, 10, -2/*mean (0.124978), correlation (0.549846)*/,
    7, 0, 12, -2/*mean (0.127002), correlation (0.537452)*/,
    -1, -6, 0, -11/*mean (0.127148), correlation (0.547401)*/
};

const float HARRIS_K = 0.04f;
const int PATCH_SIZE = 31;
const int HALF_PATCH_SIZE = 15;
const int EDGE_THRESHOLD = 16;
const float factorPI = static_cast<float>(CV_PI / 180.f);

static bool greaterThanPtr(const float* i, const float* j) {
  return (*i > *j);
}

static void goodFeaturesToTrack_neon(const cv::Mat& image0,
                                     std::vector<cv::Point2f>& corners,  // NOLINT
                                     int maxCorners,
                                     double qualityLevel,
                                     double minDistance) {
  auto start_time_goodFeaturesToTrack = std::chrono::high_resolution_clock::now();
#ifdef __ARM_NEON__
  // STEP 1: calculate eig
  cv::Mat image = image0.clone();
  cv::Mat eig;
  eig.create(image.size(), CV_32F);
  if (!image.isContinuous() || !eig.isContinuous() || image.type() != CV_8UC1) {
    LOG(FATAL) << "FAILED at the very beginning..";
  }
  int w = image.cols;
  int h = image.rows;
  float *eigdata = reinterpret_cast<float*>(eig.data);
  float *cov = NULL;
  if (posix_memalign(reinterpret_cast<void**>(&cov), 16, w*h*3*sizeof(float))) {
    LOG(FATAL) << "posix_memalign FAILED..";
  }
  memset(eigdata, 0, w * sizeof(float));
  memset(eigdata + (w * (h - 1)), 0, w * sizeof(float));
  memset(cov, 0, w * 3 * sizeof(float));
  memset(cov + (w * 3 * (h - 1)), 0, w * 3 * sizeof(float));

  float* dest = cov + w * 3;
  const unsigned char* const srcmax = reinterpret_cast<unsigned char*>(image.data) + w * (h - 1);
  /*
   __m128 dxdx_prev = _mm_setzero_ps();
   __m128 dxdy_prev = _mm_setzero_ps();
   __m128 dydy_prev = _mm_setzero_ps();
   __m128 dxdx_sum_prev = _mm_setzero_ps();
   __m128 dxdy_sum_prev = _mm_setzero_ps();
   __m128 dydy_sum_prev = _mm_setzero_ps();
   */
  float32x4_t dxdx_prev = vdupq_n_f32(0.0f);
  float32x4_t dxdy_prev = vdupq_n_f32(0.0f);
  float32x4_t dydy_prev = vdupq_n_f32(0.0f);
  float32x4_t dxdx_sum_prev = vdupq_n_f32(0.0f);
  float32x4_t dxdy_sum_prev = vdupq_n_f32(0.0f);
  float32x4_t dydy_sum_prev = vdupq_n_f32(0.0f);
  float harris_k = 0.04;

  for (const unsigned char *p = reinterpret_cast<unsigned char*>(image.data) + w;
       p < srcmax; p += 16) {
    /*
     __m128i in_u = _mm_load_si128(reinterpret_cast<const __m128i*>(p - w));
     __m128i in_d = _mm_load_si128(reinterpret_cast<const __m128i*>(p + w));
     __m128i in_l = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p - 1));
     __m128i in_r = _mm_loadu_si128(reinterpret_cast<const __m128i*>(p + 1));
     */
    // NOTE: www.itlab.unn.ru/file.php?id=731 "ARM NEON SIMD" lecture pdf
    // Page 37: "NEON - no way to specify alignment for intrinsics"
    // The (const int32_t*) cast seems working fine and gets recommended (Page 43).
    int32x4_t in_l = vld1q_s32((const int32_t*)(p - 1));
    int32x4_t in_r = vld1q_s32((const int32_t*)(p + 1));
    int32x4_t in_u = vld1q_s32((const int32_t*)(p - w));
    int32x4_t in_d = vld1q_s32((const int32_t*)(p + w));

    /*
     __m128i dx16[2] = {
     _mm_sub_epi16(_mm_cvtepu8_epi16(in_r), _mm_cvtepu8_epi16(in_l)),
     _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(in_r, 8)),
     _mm_cvtepu8_epi16(_mm_srli_si128(in_l, 8)))
     };
     __m128i dy16[2] = {
     _mm_sub_epi16(_mm_cvtepu8_epi16(in_d), _mm_cvtepu8_epi16(in_u)),
     _mm_sub_epi16(_mm_cvtepu8_epi16(_mm_srli_si128(in_d, 8)),
     _mm_cvtepu8_epi16(_mm_srli_si128(in_u, 8)))
     };
     */
    // NOTE: _mm_cvtepu8_epi16 is "zero-extended" and treated as unsigned
    // A great reference check for NEON vs SSE: https://software.intel.com/sites/default/files/managed/cf/f6/NEONvsSSE.h
    // int32x4_t -> int32x2_t -> uint8x8_t -> uint16x8_t
    uint16x8_t in_r_low = vmovl_u8(vreinterpret_u8_s32(vget_low_s32(in_r)));
    uint16x8_t in_l_low = vmovl_u8(vreinterpret_u8_s32(vget_low_s32(in_l)));
    uint16x8_t in_r_high = vmovl_u8(vreinterpret_u8_s32(vget_high_s32(in_r)));
    uint16x8_t in_l_high = vmovl_u8(vreinterpret_u8_s32(vget_high_s32(in_l)));
    uint16x8_t in_d_low = vmovl_u8(vreinterpret_u8_s32(vget_low_s32(in_d)));
    uint16x8_t in_u_low = vmovl_u8(vreinterpret_u8_s32(vget_low_s32(in_u)));
    uint16x8_t in_d_high = vmovl_u8(vreinterpret_u8_s32(vget_high_s32(in_d)));
    uint16x8_t in_u_high = vmovl_u8(vreinterpret_u8_s32(vget_high_s32(in_u)));
    // todo: confirm that re-interpret uint16x8_t -> int16x8_t before subtraction is good practice
    int16x8_t dx16[2] = {
      vsubq_s16(vreinterpretq_s16_u16(in_r_low), vreinterpretq_s16_u16(in_l_low)),
      vsubq_s16(vreinterpretq_s16_u16(in_r_high), vreinterpretq_s16_u16(in_l_high))};
    int16x8_t dy16[2] = {
      vsubq_s16(vreinterpretq_s16_u16(in_d_low), vreinterpretq_s16_u16(in_u_low)),
      vsubq_s16(vreinterpretq_s16_u16(in_d_high), vreinterpretq_s16_u16(in_u_high))};

    /*
     __m128 dx[4] = {
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(dx16[0])),
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(dx16[0], 8))),
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(dx16[1])),
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(dx16[1], 8)))
     };
     __m128 dy[4] = {
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(dy16[0])),
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(dy16[0], 8))),
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(dy16[1])),
     _mm_cvtepi32_ps(_mm_cvtepi16_epi32(_mm_srli_si128(dy16[1], 8)))
     };
     http://stackoverflow.com/questions/30528352/how-to-convert-unsigned-char-to-signed-integer-by-neon
     "The NEON vector types are not guaranteed to be convertible by casts,
     so for most portability you should write vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(v)))."
     _mm_cvtepi16_epi32 is sign-extended, and vmovl_s16 is sign-extended.
     NOTE: _mm_cvtepi32_ps shoud be: float32x4_t vcvtq_f32_s32(int32x4_t a);
     NOTE: dx16/dy16: int16x8_t -> int16x4_t -> int32x4_t -> float32x4_t
     */
    float32x4_t dx[4] = {
      vcvtq_f32_s32(vmovl_s16(vget_low_s16(dx16[0]))),
      vcvtq_f32_s32(vmovl_s16(vget_high_s16(dx16[0]))),
      vcvtq_f32_s32(vmovl_s16(vget_low_s16(dx16[1]))),
      vcvtq_f32_s32(vmovl_s16(vget_high_s16(dx16[1])))
    };
    float32x4_t dy[4] = {
      vcvtq_f32_s32(vmovl_s16(vget_low_s16(dy16[0]))),
      vcvtq_f32_s32(vmovl_s16(vget_high_s16(dy16[0]))),
      vcvtq_f32_s32(vmovl_s16(vget_low_s16(dy16[1]))),
      vcvtq_f32_s32(vmovl_s16(vget_high_s16(dy16[1])))
    };


    // NOTE(): bracket No.1
    {
    // We process in one iteration:
    // |p1 p2 p3 p4|a b c d|e f g h|i j k l|m n o p|

    /***** Previous Cell starts here *****/
    // |a b c d|e f g h|
    /*
     __m128 product[2] = {
     _mm_mul_ps(dx[0], dx[0]), _mm_mul_ps(dx[1], dx[1])
     };
     */
    float32x4_t product[2] = {vmulq_f32(dx[0], dx[0]), vmulq_f32(dx[1], dx[1])};

    // |p2 p3 p4 0| AND |0 0 0 a| => |p2 p3 p4 a|
    /*
     __m128 shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(dxdx_prev), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 12)));
     NOTE: http://stackoverflow.com/questions/11259596/arm-neon-intrinsics-rotation
     */
    float32x4_t shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(dxdx_prev), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 1)));

    // Store |p2 p3 p4 a| + prev_sum
    /*
     _mm_store_ps(dest-36, _mm_add_ps(shiftfrom_r, dxdx_sum_prev));
     */
    vst1q_f32(dest - 36, vaddq_f32(shiftfrom_r, dxdx_sum_prev));
    /***** Previous Cell ends here *****/


    /***** First Cell starts here *****/
    /*
     // |0 a b c| AND |p4 0 0 0| => |p4 a b c|
     __m128 shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(dxdx_prev), 12)));
     */
    float32x4_t shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 3),
        vextq_s32(vreinterpretq_s32_f32(dxdx_prev), vdupq_n_s32(0), 3)));
    /*
     // |b c d 0| AND |0 0 0 e| => |b c d e|
     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 12)));
     */
    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 1)));

    /*
     // Store |p4 a b c| +  |a b c d| + |b c d e|
     _mm_store_ps(dest,
     _mm_add_ps(_mm_add_ps(product[0], shiftfrom_l), shiftfrom_r));
     */
    vst1q_f32(dest, vaddq_f32(vaddq_f32(product[0], shiftfrom_l), shiftfrom_r));
    /***** First Cell starts here *****/


    /***** Second Cell starts here *****/
    /*
     // |0 e f g| AND |d 0 0 0| => |d e f g|
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 12)));
     */
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 3)));

    /*
     // |i j k l|
     product[0] = _mm_mul_ps(dx[2], dx[2]);
     */
    product[0] = vmulq_f32(dx[2], dx[2]);

    /*
     // |f g h 0| AND |0 0 0 i| => |f g h i|
     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 12)));

     // Store |d e f g| +  |e f g h| + |f g h i|
     _mm_store_ps(dest+4,
     _mm_add_ps(_mm_add_ps(product[1], shiftfrom_l), shiftfrom_r));
     */
    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 1),
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 1)));
    vst1q_f32(dest + 4, vaddq_f32(vaddq_f32(product[1], shiftfrom_l), shiftfrom_r));
    /***** Second Cell starts here *****/


    /***** Third Cell starts here *****/
    /*
     // |0 i j k| AND |h 0 0 0| => |h i j k|
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[1]), 12)));
     */
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 3),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 3)));

    /*
     // |m n o p|
     product[1] = _mm_mul_ps(dx[3], dx[3]);
     */
    product[1] = vmulq_f32(dx[3], dx[3]);

    /*
     // |j k l 0| AND |0 0 0 m| => |j k l m|
     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 12)));

     // Store |h i j k| +  |i j k l| + |j k l m|
     _mm_store_ps(dest+8,
     _mm_add_ps(_mm_add_ps(product[0], shiftfrom_l), shiftfrom_r));
     */
    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 1)));
    vst1q_f32(dest + 8, vaddq_f32(vaddq_f32(product[0], shiftfrom_l), shiftfrom_r));
    /***** Third Cell starts here *****/


    /***** Fourth Cell starts here *****/
    /*
     // |0 m n o| AND |l 0 0 0| => |l m n o|
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 12)));
     
     // Store |l m n o| +  |m n o p|
     dxdx_sum_prev = _mm_add_ps(product[1], shiftfrom_l);
     dxdx_prev = product[1];
     */
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 3)));
    dxdx_sum_prev = vaddq_f32(product[1], shiftfrom_l);
    dxdx_prev = product[1];
    /***** Fourth Cell starts here *****/

    dest += 16;
    }


    // NOTE(): bracket No.2
    {
    /*
     // ######## Previous Cell
     __m128 product[2] = {
     _mm_mul_ps(dx[0], dy[0]), _mm_mul_ps(dx[1], dy[1])
     };

     __m128 shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(dxdy_prev), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 12)));

     _mm_store_ps(dest-36, _mm_add_ps(shiftfrom_r, dxdy_sum_prev));


     // ######## First Cell
     __m128 shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(dxdy_prev), 12)));

     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 12)));

     _mm_store_ps(dest,
     _mm_add_ps(_mm_add_ps(product[0], shiftfrom_l), shiftfrom_r));


     // ######## Second Cell
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 12)));

     product[0] = _mm_mul_ps(dx[2], dy[2]);

     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 12)));

     _mm_store_ps(dest+4,
     _mm_add_ps(_mm_add_ps(product[1], shiftfrom_l), shiftfrom_r));


     // ######## Third Cell
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[1]), 12)));

     product[1] = _mm_mul_ps(dx[3], dy[3]);

     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 12)));

     _mm_store_ps(dest+8,
     _mm_add_ps(_mm_add_ps(product[0], shiftfrom_l), shiftfrom_r));


     // ######## Fourth Cell
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 12)));

     dxdy_sum_prev = _mm_add_ps(product[1], shiftfrom_l);
     dxdy_prev = product[1];

     dest += 16;
     */

    // ######## Previous Cell
    float32x4_t product[2] = {vmulq_f32(dx[0], dy[0]), vmulq_f32(dx[1], dy[1])};

    float32x4_t shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(dxdy_prev), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 1)));

    vst1q_f32(dest - 36, vaddq_f32(shiftfrom_r, dxdy_sum_prev));


    // ######## First Cell
    float32x4_t shiftfrom_l =
    vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 3),
        vextq_s32(vreinterpretq_s32_f32(dxdy_prev), vdupq_n_s32(0), 3)));

    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 1)));

    vst1q_f32(dest, vaddq_f32(vaddq_f32(product[0], shiftfrom_l), shiftfrom_r));


    // ######## Second Cell
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 3)));

    product[0] = vmulq_f32(dx[2], dy[2]);

    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[1]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 1)));

    vst1q_f32(dest + 4, vaddq_f32(vaddq_f32(product[1], shiftfrom_l), shiftfrom_r));


    // ######## Third Cell
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[1]), vdupq_n_s32(0), 3)));

    product[1] = vmulq_f32(dx[3], dy[3]);

    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 1)));

    vst1q_f32(dest + 8, vaddq_f32(vaddq_f32(product[0], shiftfrom_l), shiftfrom_r));


    // ######## Fourth Cell
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 3)));

    dxdy_sum_prev = vaddq_f32(product[1], shiftfrom_l);
    dxdy_prev = product[1];

    dest += 16;
    }


    // NOTE(): bracket No.3
    {
    /*
     // ######## Previous Cell
     __m128 product[2] = {
     _mm_mul_ps(dy[0], dy[0]), _mm_mul_ps(dy[1], dy[1])
     };

     __m128 shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(dydy_prev), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 12)));

     _mm_store_ps(dest-36, _mm_add_ps(shiftfrom_r, dydy_sum_prev));


     // ######## First Cell
     __m128 shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(dydy_prev), 12)));

     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 12)));

     _mm_store_ps(dest,
     _mm_add_ps(_mm_add_ps(product[0], shiftfrom_l), shiftfrom_r));


     // ######## Second Cell
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 12)));

     product[0] = _mm_mul_ps(dy[2], dy[2]);

     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 12)));

     _mm_store_ps(dest+4,
     _mm_add_ps(_mm_add_ps(product[1], shiftfrom_l), shiftfrom_r));


     // ######## Third Cell
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[1]), 12)));

     product[1] = _mm_mul_ps(dy[3], dy[3]);

     shiftfrom_r = _mm_and_ps(
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 4)),
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 12)));

     _mm_store_ps(dest+8,
     _mm_add_ps(_mm_add_ps(product[0], shiftfrom_l), shiftfrom_r));


     // ######## Fourth Cell
     shiftfrom_l = _mm_and_ps(
     _mm_castsi128_ps(_mm_slli_si128(_mm_castps_si128(product[1]), 4)),
     _mm_castsi128_ps(_mm_srli_si128(_mm_castps_si128(product[0]), 12)));

     dydy_sum_prev = _mm_add_ps(product[1], shiftfrom_l);
     dydy_prev = product[1];

     dest += 16;
     */

    // ######## Previous Cell
    float32x4_t product[2] = {vmulq_f32(dy[0], dy[0]), vmulq_f32(dy[1], dy[1])};

    float32x4_t shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(dydy_prev), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 1)));

    vst1q_f32(dest - 36, vaddq_f32(shiftfrom_r, dydy_sum_prev));


    // ######## First Cell
    float32x4_t shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 3),
        vextq_s32(vreinterpretq_s32_f32(dydy_prev), vdupq_n_s32(0), 3)));

    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 1)));

    vst1q_f32(dest, vaddq_f32(vaddq_f32(product[0], shiftfrom_l), shiftfrom_r));


    // ######## Second Cell
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 3)));

    product[0] = vmulq_f32(dy[2], dy[2]);

    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[1]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 1)));

    vst1q_f32(dest + 4, vaddq_f32(vaddq_f32(product[1], shiftfrom_l), shiftfrom_r));


    // ######## Third Cell
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[0]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[1]), vdupq_n_s32(0), 3)));

    product[1] = vmulq_f32(dy[3], dy[3]);

    shiftfrom_r = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 1),
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 1)));

    vst1q_f32(dest + 8, vaddq_f32(vaddq_f32(product[0], shiftfrom_l), shiftfrom_r));


    // ######## Fourth Cell
    shiftfrom_l = vreinterpretq_f32_s32(
      vandq_s32(
        vextq_s32(vdupq_n_s32(0), vreinterpretq_s32_f32(product[1]), 3),
        vextq_s32(vreinterpretq_s32_f32(product[0]), vdupq_n_s32(0), 3)));

    dydy_sum_prev = vaddq_f32(product[1], shiftfrom_l);
    dydy_prev = product[1];

    dest += 16;
    }
  }
  const int t1 =
  std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::high_resolution_clock::now() - start_time_goodFeaturesToTrack).count() / 1000;

  /*
   // Smooth the cov in y with a 3-neighborhood,
   // compute the eigenvalues, and store the lower one to eig
   const __m128 half = _mm_set1_ps(8.0);
   const __m128 full = _mm_set1_ps(16.0);
   const int w3 = w*3;

   __m128 maximum = _mm_set1_ps(-1.0f);
   dest = eigdata+w;
   const float * const covmax = cov + w*(h-1)*3;
   
   for (const float *p = cov + w3; p < covmax; p+=32) {
   for (int i = 0; i < 4; i++, p+=4, dest+=4) {
   const __m128 dxdx = _mm_mul_ps(half, _mm_add_ps(_mm_add_ps(
   _mm_load_ps(p), _mm_load_ps(p-w3)),
   _mm_load_ps(p+w3)));
   const __m128 dxdy = _mm_mul_ps(full, _mm_add_ps(_mm_add_ps(
   _mm_load_ps(p+16), _mm_load_ps(p-w3+16)),
   _mm_load_ps(p+w3+16)));
   const __m128 dydy = _mm_mul_ps(half, _mm_add_ps(_mm_add_ps(
   _mm_load_ps(p+32), _mm_load_ps(p-w3+32)),
   _mm_load_ps(p+w3+32)));
   __m128 t = _mm_sub_ps(dxdx, dydy);
   t = _mm_add_ps(_mm_mul_ps(t, t), _mm_mul_ps(dxdy, dxdy));
   const __m128 eig2 = _mm_sub_ps(_mm_add_ps(dxdx, dydy), _mm_sqrt_ps(t));
   _mm_store_ps(dest, eig2);
   // Update the maximum
   maximum = _mm_max_ps(maximum, eig2);
   }
   }
   */

  float32x4_t half = vdupq_n_f32(8.0f);
  float32x4_t full = vdupq_n_f32(16.0f);
  const int w3 = w * 3;
  float32x4_t maximum = vdupq_n_f32(-1.0f);
  /*
   * NOTE(): float *eigdata = reinterpret_cast<float*>(eig.data);
   * below is to compute the smaller eigen value and store to dest/eigdata/eig
   * */
  dest = eigdata + w;
  const float* const covmax = cov + w * (h - 1) * 3;
  float32x4_t min_dxdx = {900, 900, 900, 900};
  int skip_count = 0;
  for (const float *p = cov + w3; p < covmax; p += 32) {
    for (int i = 0; i < 4; i++, p += 4, dest += 4) {
      float32x4_t dxdx = vmulq_f32(half,
        vaddq_f32(vaddq_f32(vld1q_f32(p), vld1q_f32(p - w3)), vld1q_f32(p + w3)));
      float32x4_t dydy = vmulq_f32(half,
        vaddq_f32(vaddq_f32(vld1q_f32(p + 32), vld1q_f32(p - w3 + 32)), vld1q_f32(p + w3 + 32)));
      // skip if both dxdx dydy are small.
      uint32x4_t small_dydx_flags = vandq_u32(vcltq_f32(dxdx, min_dxdx), vcltq_f32(dydy, min_dxdx));
      bool skip_flag = true;
      /*
       if (p == cov + w3)
       LOG(ERROR)
       << "dxdx[0]" << vgetq_lane_f32(dxdx, 0)
       << " dydy[0] " << vgetq_lane_f32(dydy, 0)
       << " dxdx[1]" << vgetq_lane_f32(dxdx, 1)
       << " dydy[1] " << vgetq_lane_f32(dydy, 1)
       << " dxdx[2]" << vgetq_lane_f32(dxdx, 2)
       << " dydy[2] " << vgetq_lane_f32(dydy, 2)
       << " dxdx[3]" << vgetq_lane_f32(dxdx, 3)
       << " dydy[3] " << vgetq_lane_f32(dydy, 3)
       << " vcltq_f32(dxdx, min_dxdx)[0] " << vgetq_lane_u32(vcltq_f32(dxdx, min_dxdx), 0)
       << " vcltq_f32(dydy, min_dxdx)[0] " << vgetq_lane_u32(vcltq_f32(dydy, min_dxdx), 0)
       << " small_dydx_flags[0] " << vgetq_lane_u32(small_dydx_flags, 0);*/
      // vgetq_lane_u32 returns 0 or 4294967295 (0xffff)
      // https://community.arm.com/thread/9285
      if (vgetq_lane_u32(small_dydx_flags, 0) == static_cast<uint32_t>(0) ||
          vgetq_lane_u32(small_dydx_flags, 1) == static_cast<uint32_t>(0) ||
          vgetq_lane_u32(small_dydx_flags, 2) == static_cast<uint32_t>(0) ||
          vgetq_lane_u32(small_dydx_flags, 3) == static_cast<uint32_t>(0)) {
        skip_flag = false;
      }
      if (skip_flag) {
        skip_count++;
        dest[0] = 0;
        dest[1] = 0;
        dest[2] = 0;
        dest[3] = 0;
        continue;
      }
      float32x4_t dxdy = vmulq_f32(full,
        vaddq_f32(vaddq_f32(vld1q_f32(p + 16), vld1q_f32(p - w3 + 16)), vld1q_f32(p + w3 + 16)));

      /* SHI-TOMASI */
      float32x4_t t = vsubq_f32(dxdx, dydy);
      t = vaddq_f32(vmulq_f32(t, t), vmulq_f32(dxdy, dxdy));
      /*
       NEON implementation of _mm_sqrt_ps:
       - the first try: https://github.com/jratcliff63367/sse2neon/blob/master/SSE2NEON.h#L653
       - the current approximative quadword float inverse square root:
       - https://pmeerw.net/blog/programming/neon1.html
       */
      float32x4_t sqrt_reciprocal = vrsqrteq_f32(t);
      float32x4_t eig2 = vsubq_f32(vaddq_f32(dxdx, dydy),
        t * vrsqrtsq_f32(t * sqrt_reciprocal, sqrt_reciprocal) * sqrt_reciprocal);
      vst1q_f32(dest, eig2);
      // Update the maximum
      maximum = vmaxq_f32(maximum, eig2);
      /* SHI-TOMASI */

      //      /* HARRIS */
      //      float32x4_t det =
      //          vsubq_f32(vmulq_f32(dxdx, dydy), vmulq_f32(dxdy, dxdy));
      //      float32x4_t trace = vaddq_f32(dxdx, dydy);
      //      float32x4_t k_trace_sqr =
      //          vmulq_f32(vdupq_n_f32(harris_k), vmulq_f32(trace, trace));
      //      float32x4_t harris = vsubq_f32(det, k_trace_sqr);
      //      vst1q_f32(dest, harris);
      //      // Update the maximum
      //      maximum = vmaxq_f32(maximum, harris);
      //      /* HARRIS */
    }
  }

  free(cov);
  const int t2 = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::high_resolution_clock::now() - start_time_goodFeaturesToTrack).count() / 1000;

  // STEP 2: NMS + spatial binning to construct std::vector<cv::Point2f>& points
  cv::Mat tmp;
  const cv::Size imgsize = image.size();
  std::vector<const float*> temp_corners;
  // TODO(xp): verify if border_size is OK with 1
  // TODO(xp): move parameters to config
  const int border_size = 1;
  const float threshold = 1000;
  const float robust_ratio = 0.9;

  cv::dilate(eig, tmp, cv::Mat());   //, cv::Point(-1,-1), 3);
  const int t3 = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::high_resolution_clock::now() - start_time_goodFeaturesToTrack).count() / 1000;

  //  std::vector<int> eig_ct(5, 0);
  //  std::vector<int> eig_thres(5, 0);
  //  eig_thres[0] = 0;
  //  eig_thres[1] = threshold;
  //  eig_thres[2] = threshold * 2;
  //  eig_thres[3] = threshold * 3;
  //  eig_thres[4] = threshold * 4;
  //  int eig_thres_minus = 0;
  //  collect list of pointers to features - put them into temporary image
  temp_corners.reserve(1000);
  for (int y = border_size; y < imgsize.height - border_size; y++) {
    float * eig_data = reinterpret_cast<float *>(eig.ptr(y));
    float * tmp_data = reinterpret_cast<float *>(tmp.ptr(y));

    for (int x = border_size; x < imgsize.width - border_size; x++) {
      float val = eig_data[x];
      if (val > threshold && val == tmp_data[x]) {
        eig_data[x] = val;
        temp_corners.push_back(eig_data + x);
      }

      //      int tmp = val / threshold;
      //      if (tmp < 0) {
      //        eig_thres_minus++;
      //      } else if (tmp >= 4) {
      //        eig_thres[4]++;
      //      } else {
      //        eig_thres[tmp]++;
      //      }
    }
  }
  const int t4 = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::high_resolution_clock::now() - start_time_goodFeaturesToTrack).count() / 1000;

  //  LOG(ERROR) << "temp_corner.size(): " << temp_corners.size();
  //  for (int i = 0; i < 5; i++) {
  //    LOG(ERROR) << "eig_thres[" << i << "] - " << eig_thres[i];
  //  }
  //  LOG(ERROR) << "eig_thres_minus: " << eig_thres_minus;
  std::sort(temp_corners.begin(), temp_corners.end(), greaterThanPtr);
  int i, j, total = temp_corners.size(), ncorners = 0;

  // Partition the image into larger grids
  const int cell_size = cvRound(minDistance);
  const int grid_width = (w + cell_size - 1) / cell_size;
  const int grid_height = (h + cell_size - 1) / cell_size;
  std::vector<std::vector<cv::Point2f> > grid(grid_width*grid_height);
  std::vector<std::vector<float> > response_grid(grid_width*grid_height);
  minDistance *= minDistance;

  const int bin_cell_size = 80;
  const int bin_grid_width = (w + bin_cell_size - 1) / bin_cell_size;  // 640 / 80 = 8
  const int bin_grid_height = (h + bin_cell_size - 1) / bin_cell_size;  // 480 / 80 = 6
  // #of features / (8 * 6)
  const int bin_threshold = maxCorners / (bin_grid_width * bin_grid_height);
  std::vector<int> bin_counter((w / bin_cell_size) * (h / bin_cell_size), 0);

  corners.reserve(total);
  for (i = 0; i < total; i++) {
    int ofs = static_cast<int>(
      reinterpret_cast<const unsigned char *>(temp_corners[i]) - eig.data);
    //    LOG(ERROR) << "temp_corners[" << i << "] - " << *temp_corners[i];
    int y = static_cast<int>(ofs / eig.step);
    int x = static_cast<int>((ofs - y*eig.step)/sizeof(float));

    float response = *(temp_corners[i]);

    bool good = true;

    int x_cell = x / cell_size;
    int y_cell = y / cell_size;
    int x_bin_cell = x / bin_cell_size;
    int y_bin_cell = y / bin_cell_size;

    int x1 = x_cell - 1;
    int y1 = y_cell - 1;
    int x2 = x_cell + 1;
    int y2 = y_cell + 1;

    // boundary check
    x1 = std::max(0, x1);
    y1 = std::max(0, y1);
    x2 = std::min(grid_width - 1, x2);
    y2 = std::min(grid_height - 1, y2);

    for (int yy = y1; yy <= y2; yy++) {
      for (int xx = x1; xx <= x2; xx++) {
        std::vector<cv::Point2f> &m = grid[yy*grid_width + xx];
        std::vector<float> &r = response_grid[yy*grid_width + xx];

        if (m.size()) {
          for (j = 0; j < m.size(); j++) {
            float dx = x - m[j].x;
            float dy = y - m[j].y;

            // Compare the ratio of responses, so features are only
            // suppressed if they are significantly weaker
            if (dx*dx + dy*dy < minDistance &&
                response <= robust_ratio * r[j]) {
              good = false;
              goto break_out;
            }
          }
        }
      }
    }

    break_out:

    if (good) {
      if (bin_counter[y_bin_cell * bin_grid_width + x_bin_cell] < bin_threshold) {
        bin_counter[y_bin_cell * bin_grid_width + x_bin_cell]++;
        grid[y_cell*grid_width + x_cell].push_back(
          cv::Point2f(static_cast<float>(x), static_cast<float>(y)));
        response_grid[y_cell*grid_width + x_cell].push_back(response);

        corners.push_back(
                          cv::Point2f(static_cast<float>(x), static_cast<float>(y)));
        ++ncorners;

        if (maxCorners > 0 && static_cast<int>(ncorners) == maxCorners) {
          break;
        }
      }
    }
  }
  const int t5 = std::chrono::duration_cast<std::chrono::microseconds>(
    std::chrono::high_resolution_clock::now() - start_time_goodFeaturesToTrack).count() / 1000;
  LOG(ERROR) << "goodFeaturesToTrack neon t1-t5: " << t1
  << " " << t2 << " " << t3 << " " << t4 << " " << t5 << " ms";

#else
  LOG(FATAL) << "goodFeaturesToTrack_neon is called without neon support";
#endif
}

// [NOTE] This function does NOT check boundary condition when computing harris response!
void ORBextractor::HarrisResponses_original(const Mat& img,
                                            int blockSize,
                                            float harris_k,
                                            vector<KeyPoint>* pts) {
  CV_Assert(img.type() == CV_8UC1 && blockSize * blockSize <= 2048);

  size_t ptidx, ptsize = pts->size();

  const uchar* ptr00 = img.ptr<uchar>();
  int step = static_cast<int>(img.step / img.elemSize1());
  int r = blockSize / 2;

  float scale = (1 << 2) * blockSize * 255.0f;
  scale = 1.0f / scale;
  float scale_sq_sq = scale * scale * scale * scale;

  AutoBuffer<int> ofsbuf(blockSize*blockSize);
  int* ofs = ofsbuf;
  for (int i = 0; i < blockSize; i++ )
    for (int j = 0; j < blockSize; j++ )
      ofs[i * blockSize + j] = static_cast<int>(i * step + j);

  for (ptidx = 0; ptidx < ptsize; ptidx++) {
    int x0 = cvRound((*pts)[ptidx].pt.x - r);
    int y0 = cvRound((*pts)[ptidx].pt.y - r);

    const uchar* ptr0 = ptr00 + y0 * step + x0;
    int a = 0, b = 0, c = 0;

    for (int k = 0; k < blockSize * blockSize; k++) {
      const uchar* ptr = ptr0 + ofs[k];
      int Ix = (ptr[1] - ptr[-1]) * 2 + (ptr[-step + 1] - ptr[-step - 1])
          + (ptr[step + 1] - ptr[step - 1]);
      int Iy = (ptr[step] - ptr[-step]) * 2 + (ptr[step - 1]
          - ptr[-step - 1]) + (ptr[step + 1] - ptr[-step + 1]);
      a += Ix * Ix;
      b += Iy * Iy;
      c += Ix * Iy;
    }
    (*pts)[ptidx].response = (static_cast<float>(a) * b - static_cast<float>(c) * c -
        harris_k * (static_cast<float>(a) + b) * (static_cast<float>(a) + b)) * scale_sq_sq;
  }
}

// [NOTE] This function does NOT check boundary condition when computing harris response!
void ORBextractor::HarrisResponses(const Mat& img,
                                   int blockSize,
                                   float harris_k,
                                   vector<KeyPoint>* pts) {
#ifdef __ARM_NEON__
  return HarrisResponses_neon(img, blockSize, harris_k, pts);
#endif  // __ARM_NEON__
  CV_Assert(img.type() == CV_8UC1 && blockSize * blockSize <= 2048);

  size_t ptidx, ptsize = pts->size();

  const uchar* ptr00 = img.ptr<uchar>();
  int step = static_cast<int>(img.step / img.elemSize1());
  int r = blockSize / 2;

  float scale = (1 << 2) * blockSize * 255.0f;
  scale = 1.0f / scale;
  float scale_sq_sq = scale * scale * scale * scale;

  AutoBuffer<int> ofsbuf(blockSize*blockSize);
  int* ofs = ofsbuf;
  for (int i = 0; i < blockSize; i++ )
    for (int j = 0; j < blockSize; j++ )
      ofs[i * blockSize + j] = static_cast<int>(i * step + j);

  for (ptidx = 0; ptidx < ptsize; ptidx++) {
    int x0 = cvRound((*pts)[ptidx].pt.x - r);
    int y0 = cvRound((*pts)[ptidx].pt.y - r);

    const uchar* ptr0 = ptr00 + y0 * step + x0;
    int a = 0, b = 0, c = 0;

    for (int k = 0; k < blockSize * blockSize; k++) {
      const uchar* ptr = ptr0 + ofs[k];
      int Ix = (ptr[1] - ptr[-1]) * 2 + (ptr[-step + 1] - ptr[-step - 1])
          + (ptr[step + 1] - ptr[step - 1]);
      int Iy = (ptr[step] - ptr[-step]) * 2 + (ptr[step - 1]
          - ptr[-step - 1]) + (ptr[step + 1] - ptr[-step + 1]);
      a += Ix * Ix;
      b += Iy * Iy;
      c += Ix * Iy;
    }
    (*pts)[ptidx].response = (static_cast<float>(a) * b - static_cast<float>(c) * c -
        harris_k * (static_cast<float>(a) + b) * (static_cast<float>(a) + b)) * scale_sq_sq;
  }
}

static float IC_Angle(const Mat& image, Point2f pt,  const vector<int> & u_max) {
  int m_01 = 0, m_10 = 0;

  const uchar* center = &image.at<uchar> (cvRound(pt.y), cvRound(pt.x));

  // Treat the center line differently, v=0
  for (int u = -HALF_PATCH_SIZE; u <= HALF_PATCH_SIZE; ++u)
    m_10 += u * center[u];

  // Go line by line in the circuI853lar patch
  int step = static_cast<int>(image.step1());
  for (int v = 1; v <= HALF_PATCH_SIZE; ++v) {
    // Proceed over the two lines
    int v_sum = 0;
    int d = u_max[v];
    for (int u = -d; u <= d; ++u) {
      int val_plus = center[u + v * step], val_minus = center[u - v * step];
      v_sum += (val_plus - val_minus);
      m_10 += u * (val_plus + val_minus);
    }
    m_01 += v * v_sum;
  }
  return cv::fastAtan2(static_cast<float>(m_01), static_cast<float>(m_10));
}

static void computeOrbDescriptor(const KeyPoint& kpt,
                                 const Mat& img, const Point* pattern_ptr,
                                 uchar* desc) {
  // previous: calc cos/sin
  // float angle = (float)kpt.angle*factorPI;
  // float a = (float)cos(angle), b = (float)sin(angle);

  // use lookup table
  float angle = static_cast<float>(kpt.angle) * factorPI;
  int angle_deg = static_cast<int>(angle * (180.f / M_PI));
  if (angle_deg < 0) {
    angle_deg += 360;
  }
  const ORBextractor::SinCosAngleVal& sin_cos_lookup = sin_cos_0_360_deg_look_up[angle_deg];
  float b = sin_cos_lookup.sin_val;
  float a = sin_cos_lookup.cos_val;

  const uchar* center = &img.at<uchar>(cvRound(kpt.pt.y), cvRound(kpt.pt.x));
  const int step = static_cast<int>(img.step);

  // static_cast<int> used to be cvRound
#define GET_VALUE(idx) \
center[static_cast<int>(pattern_ptr[idx].x * b + pattern_ptr[idx].y * a) * step + \
static_cast<int>(pattern_ptr[idx].x * a - pattern_ptr[idx].y * b)]

  int t0;
  int t1;
  int val;
  for (int i = 0; i < 32; ++i, pattern_ptr += 16) {
    t0 = GET_VALUE(0); t1 = GET_VALUE(1);
    val = t0 < t1;
    t0 = GET_VALUE(2); t1 = GET_VALUE(3);
    val |= (t0 < t1) << 1;
    t0 = GET_VALUE(4); t1 = GET_VALUE(5);
    val |= (t0 < t1) << 2;
    t0 = GET_VALUE(6); t1 = GET_VALUE(7);
    val |= (t0 < t1) << 3;
    t0 = GET_VALUE(8); t1 = GET_VALUE(9);
    val |= (t0 < t1) << 4;
    t0 = GET_VALUE(10); t1 = GET_VALUE(11);
    val |= (t0 < t1) << 5;
    t0 = GET_VALUE(12); t1 = GET_VALUE(13);
    val |= (t0 < t1) << 6;
    t0 = GET_VALUE(14); t1 = GET_VALUE(15);
    val |= (t0 < t1) << 7;

    desc[i] = (uchar)val;
  }

#undef GET_VALUE
}

void computeOrbDescriptor_neon512(const KeyPoint& kpt,
                                  const Mat& img,
                                  uchar* desc) {
#ifdef __ARM_NEON__
  const float angle = static_cast<float>(kpt.angle) * factorPI;
  int angle_deg = static_cast<int>(angle / M_PI * 180.f);
  if (angle_deg < 0) {
    angle_deg += 360;
  }
  CHECK_LE(angle_deg, 360);
  CHECK_GE(angle_deg, 0);

  const ORBextractor::SinCosAngleVal& sin_cos_lookup = sin_cos_0_360_deg_look_up[angle_deg];
  const float sin_angle = sin_cos_lookup.sin_val;
  const float cos_angle = sin_cos_lookup.cos_val;
  const int row = cvRound(kpt.pt.y);
  const int col = cvRound(kpt.pt.x);
  const int32_t step = static_cast<int32_t>(img.step1());
  CHECK_GT(row, 15);
  CHECK_GT(col, 15);
  CHECK_LT(row, img.rows - 15);
  CHECK_LT(col, img.cols - 15);
  const uchar* center = img.data + row * step + col;
  int t[16];
  for (int i = 0; i < 32; ++i) {
    for (int j = 0; j < 16; j+= 4) {
      float32x4_t vecX = vld1q_f32(bit_pattern_31_x_ + j + i * 16);
      float32x4_t vecY = vld1q_f32(bit_pattern_31_y_ + j + i * 16);
      int32x4_t vecSub_int = vcvtq_s32_f32(vmlsq_n_f32(
        vmulq_n_f32(vecX, cos_angle), vecY, sin_angle));
      int32x4_t vecAdd_int = vcvtq_s32_f32(vmlaq_n_f32(
        vmulq_n_f32(vecX, sin_angle), vecY, cos_angle));
      int32x4_t vecAdd_int_times_step_plus_col = vmlaq_n_s32(vecSub_int, vecAdd_int, step);
      t[0 + j] = center[vgetq_lane_s32(vecAdd_int_times_step_plus_col, 0)];
      t[1 + j] = center[vgetq_lane_s32(vecAdd_int_times_step_plus_col, 1)];
      t[2 + j] = center[vgetq_lane_s32(vecAdd_int_times_step_plus_col, 2)];
      t[3 + j] = center[vgetq_lane_s32(vecAdd_int_times_step_plus_col, 3)];
    }
    int val =
    (t[0] < t[1])
    | ((t[2] < t[3]) << 1)
    | ((t[4] < t[5]) << 2)
    | ((t[6] < t[7]) << 3)
    | ((t[8] < t[9]) << 4)
    | ((t[10] < t[11]) << 5)
    | ((t[12] < t[13]) << 6)
    | ((t[14] < t[15]) << 7);

    desc[i] = (uchar)val;
  }
#else
  LOG(FATAL) << "computeOrbDescriptor_neon512 is called without neon support";
#endif
}

ORBextractor::ORBextractor(int _nfeatures, float _scaleFactor, int _nlevels, int _scoreType,
                           int _fastTh, bool use_fast):
    nfeatures(_nfeatures), scaleFactor(_scaleFactor), nlevels(_nlevels),
    scoreType(_scoreType), fastTh(_fastTh), use_fast_(use_fast) {
  mvScaleFactor.resize(nlevels);
  mvScaleFactor[0] = 1;
  for (int i = 1; i < nlevels; i++)
    mvScaleFactor[i] = mvScaleFactor[i-1] * scaleFactor;

  float invScaleFactor = 1.0f / scaleFactor;
  mvInvScaleFactor.resize(nlevels);
  mvInvScaleFactor[0] = 1;
  for (int i = 1; i < nlevels; i++)
    mvInvScaleFactor[i] = mvInvScaleFactor[i-1] * invScaleFactor;

  mvImagePyramid.resize(nlevels);
  mvMaskPyramid.resize(nlevels);

  mnFeaturesPerLevel.resize(nlevels);
  float factor = static_cast<float>(1.0 / scaleFactor);
  float nDesiredFeaturesPerScale = nfeatures*(1.0f - factor)
      / (1.0f - static_cast<float>(pow(static_cast<double>(factor),
                                       static_cast<double>(nlevels))));

  int sumFeatures = 0;
  for (int level = 0; level < nlevels-1; level++) {
    mnFeaturesPerLevel[level] = cvRound(nDesiredFeaturesPerScale);
    sumFeatures += mnFeaturesPerLevel[level];
    nDesiredFeaturesPerScale *= factor;
  }
  mnFeaturesPerLevel[nlevels-1] = std::max(nfeatures - sumFeatures, 0);

  // This is for orientation
  // pre-compute the end of a row in a circular patch
  umax.resize(HALF_PATCH_SIZE + 1);

  int v, v0, vmax = cvFloor(HALF_PATCH_SIZE * sqrt(2.f) / 2 + 1);
  int vmin = cvCeil(HALF_PATCH_SIZE * sqrt(2.f) / 2);
  const double hp2 = HALF_PATCH_SIZE*HALF_PATCH_SIZE;
  for (v = 0; v <= vmax; ++v)
    umax[v] = cvRound(sqrt(hp2 - v * v));

  // Make sure we are symmetric
  for (v = HALF_PATCH_SIZE, v0 = 0; v >= vmin; --v) {
    while (umax[v0] == umax[v0 + 1]) ++v0;
    umax[v] = v0;
    ++v0;
  }
}

void ORBextractor::computeOrientation(const Mat& image,
                                      const vector<int>& umax,
                                      vector<KeyPoint>* keypoints) {
  for (vector<KeyPoint>::iterator keypoint = keypoints->begin(),
           keypointEnd = keypoints->end(); keypoint != keypointEnd; ++keypoint) {
    keypoint->angle = IC_Angle(image, keypoint->pt, umax);
  }
}

void ORBextractor::ComputeKeyPoints(vector<vector<KeyPoint> >* allKeypoints) {
  allKeypoints->resize(nlevels);
  // make cell sqaure
  float imageRatio = static_cast<float>(mvImagePyramid[0].rows) / mvImagePyramid[0].cols;

  for (int level = 0; level < nlevels; ++level) {
    const int nDesiredFeatures = mnFeaturesPerLevel[level];
    const int minBorderX = EDGE_THRESHOLD;
    const int minBorderY = minBorderX;
    const int maxBorderX = mvImagePyramid[level].cols-EDGE_THRESHOLD;
    const int maxBorderY = mvImagePyramid[level].rows-EDGE_THRESHOLD;

    const int W = maxBorderX - minBorderX;
    const int H = maxBorderY - minBorderY;

    // const int levelCols = sqrt((float)nDesiredFeatures/(5*imageRatio));
    // const int levelRows = imageRatio*levelCols;

    // const int cellW = ceil((float)W/levelCols);
    // const int cellH = ceil((float)H/levelRows);
    const int cellW = 25;
    const int cellH = 25;

    const int levelCols = ceil(static_cast<float>(W) / cellW);
    const int levelRows = ceil(static_cast<float>(H) / cellH);


    // std::cout << " cellW " << cellW << " levelCols " << levelCols << std::endl;
    // std::cout << " cellH " << cellH << " levelRows " << levelRows << std::endl;
    const int nCells = levelRows*levelCols;
    const int nfeaturesCell = ceil(static_cast<float>(nDesiredFeatures) / nCells);

    vector<vector<vector<KeyPoint> > > cellKeyPoints(levelRows,
                                                     vector<vector<KeyPoint> >(levelCols));

    vector<vector<int> > nToRetain(levelRows, vector<int>(levelCols));
    vector<vector<int> > nTotal(levelRows, vector<int>(levelCols));
    vector<vector<bool> > bNoMore(levelRows, vector<bool>(levelCols, false));
    vector<int> iniXCol(levelCols);
    vector<int> iniYRow(levelRows);
    int nNoMore = 0;
    int nToDistribute = 0;

    for (int i = 0; i < levelRows; i++) {
      const float iniY = minBorderY + i * cellH - 3;
      iniYRow[i] = iniY;
      // The min x y returned by opencv fast det is 3.
      float hY = cellH + 6;

      if (i == levelRows-1) {
        hY = maxBorderY + 3 - iniY;
        if (hY <= 0) continue;
      }

      for (int j = 0; j < levelCols; j++) {
        float iniX;
        if (i == 0) {
          iniX = minBorderX + j * cellW - 3;
          iniXCol[j] = iniX;
        } else {
          iniX = iniXCol[j];
        }
        float hX = cellW + 6;
        if (j == levelCols-1) {
          hX = maxBorderX + 3 - iniX;
          if (hX <= 0) continue;
        }
        /*
        std::cout << "i " << i << " j " << j
        << " iniY " << iniY << " iniX " << iniX
        << " hY " << hY << " hX " << hX << std::endl;
        */
        Mat cellImage = mvImagePyramid[level].rowRange(iniY, iniY + hY).colRange(iniX, iniX + hX);

        Mat cellMask;
        if (!mvMaskPyramid[level].empty())
          cellMask = cv::Mat(mvMaskPyramid[level], Rect(iniX, iniY, hX, hY));
        cellKeyPoints[i][j].reserve(nfeaturesCell * 5);
        // std::cout << " cellMask " << cellMask << std::endl;
        // std::cout << " cellImage " << cellImage << std::endl;
        cv::Ptr<FastFeatureDetector> fd = cv::FastFeatureDetector::create(fastTh, true);
        fd->detect(cellImage, cellKeyPoints[i][j], cellMask);
        /*
        if(cellKeyPoints[i][j].size()<=3) {
          cellKeyPoints[i][j].clear();
          cv::Ptr<FastFeatureDetector> fd = cv::FastFeatureDetector::create(7, true);
          fd->detect(cellImage, cellKeyPoints[i][j], cellMask);
        }*/
        /*
        for(const auto& kp : cellKeyPoints[i][j]) {
          std::cout << "kp.pt " << kp.pt << " response " << kp.response
          << " iniX / Y " << iniX << " / " << iniY
          << " cellImage.size " << cellImage.size() << std::endl;
        }*/

        if (scoreType == ORB::HARRIS_SCORE) {
          // Compute the Harris cornerness
          HarrisResponses(cellImage, 7, HARRIS_K, &cellKeyPoints[i][j]);
        }

        const int nKeys = cellKeyPoints[i][j].size();
        nTotal[i][j] = nKeys;

        if (nKeys > nfeaturesCell) {
          nToRetain[i][j] = nfeaturesCell;
          bNoMore[i][j] = false;
        } else {
          nToRetain[i][j] = nKeys;
          nToDistribute += nfeaturesCell - nKeys;
          bNoMore[i][j] = true;
          nNoMore++;
        }
      }
    }

    // Retain by score
    while (nToDistribute > 0 && nNoMore < nCells) {
      int nNewFeaturesCell = nfeaturesCell
          + ceil(static_cast<float>(nToDistribute) / (nCells - nNoMore));
      nToDistribute = 0;

      for (int i = 0; i < levelRows; i++) {
        for (int j = 0; j < levelCols; j++) {
          if (!bNoMore[i][j]) {
            if (nTotal[i][j] > nNewFeaturesCell) {
              nToRetain[i][j] = nNewFeaturesCell;
              bNoMore[i][j] = false;
            } else {
              nToRetain[i][j] = nTotal[i][j];
              nToDistribute += nNewFeaturesCell - nTotal[i][j];
              bNoMore[i][j] = true;
              nNoMore++;
            }
          }
        }
      }
    }

    vector<KeyPoint> & keypoints = (*allKeypoints)[level];
    keypoints.reserve(nDesiredFeatures * 2);

    const int scaledPatchSize = PATCH_SIZE * mvScaleFactor[level];

    // Retain by score and transform coordinates
    for (int i = 0; i < levelRows; i++) {
      for (int j = 0; j < levelCols; j++) {
        vector<KeyPoint> &keysCell = cellKeyPoints[i][j];
        KeyPointsFilter::retainBest(keysCell, nToRetain[i][j]);
        if (static_cast<int>(keysCell.size()) > nToRetain[i][j])
          keysCell.resize(nToRetain[i][j]);

        for (size_t k = 0, kend = keysCell.size(); k < kend; k++) {
          keysCell[k].pt.x += iniXCol[j];
          keysCell[k].pt.y += iniYRow[i];
          keysCell[k].octave = level;
          keysCell[k].size = scaledPatchSize;
          keypoints.push_back(keysCell[k]);
        }
      }
    }
    if (static_cast<int>(keypoints.size()) > nDesiredFeatures) {
      KeyPointsFilter::retainBest(keypoints, nDesiredFeatures);
      keypoints.resize(nDesiredFeatures);
    }
  }

  // and compute orientations
  for (int level = 0; level < nlevels; ++level) {
    computeOrientation(mvImagePyramid[level], umax, &((*allKeypoints)[level]));
  }
}

void ORBextractor::computeDescriptors(const Mat& image,
                                      const vector<KeyPoint>& keypoints, Mat* descriptors) {
  if (keypoints.size() > 0) {
    if (!descriptors->empty()) {
      CHECK_EQ(descriptors->rows, keypoints.size());
      CHECK_EQ(descriptors->cols, 32);
      CHECK_EQ(descriptors->type(), CV_8U);
    } else {
      descriptors->create(keypoints.size(), 32, CV_8U);
    }
    // descriptors.setTo(0);
    const Point* pattern_ptr = (const Point*)(bit_pattern_31_);
    for (size_t i = 0; i < keypoints.size(); i++) {
      // LOG(ERROR) << i << ":" << keypoints[i].pt;
      computeOrbDescriptor(keypoints[i], image, pattern_ptr, descriptors->ptr(static_cast<int>(i)));
    }
  }
}

void ORBextractor::computeDescriptorsN512(const Mat& image,
                                          const vector<KeyPoint>& keypoints, Mat* descriptors) {
  if (keypoints.size() > 0) {
    if (!descriptors->empty()) {
      CHECK_EQ(descriptors->rows, keypoints.size());
      CHECK_EQ(descriptors->cols, 32);
      CHECK_EQ(descriptors->type(), CV_8U);
    } else {
      descriptors->create(keypoints.size(), 32, CV_8U);
    }
    // descriptors.setTo(0);
    for (size_t i = 0; i < keypoints.size(); i++) {
      computeOrbDescriptor_neon512(keypoints[i], image, descriptors->data + i * 32);
    }
  }
}

void ORBextractor::detect(const cv::Mat& image,
                          const cv::Mat& mask,
                          std::vector<cv::KeyPoint>* keypoints,
                          cv::Mat* descriptors) {
  if (image.empty())
    return;
  assert(image.type() == CV_8UC1);

  // Pre-compute the scale pyramids
  auto start_time = std::chrono::high_resolution_clock::now();
  ComputePyramid(image, mask);

  vector<vector<KeyPoint> > allKeypoints;

  auto start_time_goodFeaturesToTrack = std::chrono::high_resolution_clock::now();

  if (use_fast_) {
    // perform ORB-SLAM version FAST detection
    ComputeKeyPoints(&allKeypoints);
  } else {
    for (int it_level = 0; it_level < nlevels; it_level++) {
      vector<cv::Point2f> points_this_level;
      // perform OpenCV NEON version SHI-TOMASI detection
      // cv::goodFeaturesToTrack(mvImagePyramid[it_level],
      // points_this_level, nfeatures >> it_level, 0.01, 4);
#ifndef __ARM_NEON__
      // perform OpenCV desktop version SHI-TOMASI detection
      cv::goodFeaturesToTrack(mvImagePyramid[it_level],
                              points_this_level, nfeatures >> it_level, 0.01, 4);
#else
      // perform our own NEON version SHI-TOMASI detection
      goodFeaturesToTrack_neon(mvImagePyramid[it_level],
                               points_this_level, nfeatures >> it_level, 0.01, 4);
#endif
      vector<KeyPoint> keypoints_this_level;
      keypoints_this_level.reserve(points_this_level.size());
      for (int i = 0; i < points_this_level.size(); i++) {
        cv::KeyPoint tmp;
        tmp.pt = points_this_level[i];
        tmp.octave = it_level;
        keypoints_this_level.push_back(tmp);
      }
      computeOrientation(mvImagePyramid[it_level], umax, &keypoints_this_level);
      // scale up the points later
      allKeypoints.push_back(keypoints_this_level);
    }
  }
#ifndef __ARM_NEON__
  const int total_time_ms_detect_features =
      std::chrono::duration_cast<std::chrono::microseconds>(
          std::chrono::high_resolution_clock::now() - start_time_goodFeaturesToTrack).count() / 1000;
  VLOG(1) << "TOTAL feature detection: " << total_time_ms_detect_features << " ms";
#endif

  int nkeypoints = 0;
  for (int level = 0; level < nlevels; ++level)
    nkeypoints += static_cast<int>(allKeypoints[level].size());
  if (nkeypoints > 0) {
    if (descriptors != nullptr) {
      descriptors->create(nkeypoints, 32, CV_8U);
    }
  }

  keypoints->clear();
  keypoints->reserve(nkeypoints);

  int offset = 0;
  for (int level = 0; level < nlevels; ++level) {
    vector<KeyPoint>& keypointsLevel = allKeypoints[level];
    int nkeypointsLevel = static_cast<int>(keypointsLevel.size());

    if (nkeypointsLevel == 0)
      continue;

    // preprocess the resized image
    Mat& workingMat = mvImagePyramid[level];
    GaussianBlur(workingMat, workingMat, Size(7, 7), 2, 2, BORDER_REFLECT_101);

    // Compute the descriptors if requested
    if (descriptors != nullptr) {
      auto descriptor_start = std::chrono::high_resolution_clock::now();
      Mat desc = descriptors->rowRange(offset, offset + nkeypointsLevel);
#ifndef __ARM_NEON__
      computeDescriptors(workingMat, keypointsLevel, &desc);
#else
      computeDescriptorsN512(workingMat, keypointsLevel, &desc);
#endif
      VLOG(1) << "Descriptors: "
              << std::chrono::duration_cast<std::chrono::milliseconds>(
                  std::chrono::high_resolution_clock::now() - descriptor_start).count()
              << " ms" << std::endl;
    }
    offset += nkeypointsLevel;
    // Scale keypoint coordinates
    if (level != 0) {
      float scale = mvScaleFactor[level];  // getScale(level, firstLevel, scaleFactor);
      for (vector<KeyPoint>::iterator keypoint = keypointsLevel.begin(),
               keypointEnd = keypointsLevel.end(); keypoint != keypointEnd; ++keypoint)
        keypoint->pt *= scale;
    }
    // And add the keypoints to the output
    keypoints->insert(keypoints->end(), keypointsLevel.begin(), keypointsLevel.end());
  }
  const int total_time_ms = std::chrono::duration_cast<std::chrono::microseconds>(
      std::chrono::high_resolution_clock::now() - start_time).count() / 1000;
  if (total_time_ms > 50) {
    LOG(ERROR) << "TOTAL feature processing too long: " << total_time_ms
               << " ms. Feat # " << nkeypoints;
  }
}

void ORBextractor::ComputePyramid(const cv::Mat& image, cv::Mat Mask) {
  for (int level = 0; level < nlevels; ++level) {
    float scale = mvInvScaleFactor[level];
    Size sz(cvRound(static_cast<float>(image.cols) * scale),
            cvRound(static_cast<float>(image.rows) * scale));
    Size wholeSize(sz.width + EDGE_THRESHOLD * 2, sz.height + EDGE_THRESHOLD * 2);
    Mat temp(wholeSize, image.type()), masktemp;
    mvImagePyramid[level] = temp(Rect(EDGE_THRESHOLD, EDGE_THRESHOLD, sz.width, sz.height));

    if (!Mask.empty()) {
      // masktemp = Mat(wholeSize, Mask.type());
      // mvMaskPyramid[level] = masktemp(Rect(EDGE_THRESHOLD, EDGE_THRESHOLD, sz.width, sz.height));
      cv::Mat resized_mask(sz, Mask.type());
      resize(Mask, resized_mask, sz, 0, 0, INTER_NEAREST);
      resized_mask.copyTo(mvMaskPyramid[level]);
    }

    // Compute the resized image
    if (level != 0) {
      resize(mvImagePyramid[level-1], mvImagePyramid[level], sz, 0, 0, INTER_LINEAR);
      // if (!Mask.empty()) {
      //   resize(mvMaskPyramid[level-1], mvMaskPyramid[level], sz, 0, 0, INTER_NEAREST);
      // }

      copyMakeBorder(mvImagePyramid[level], temp, EDGE_THRESHOLD,
                     EDGE_THRESHOLD, EDGE_THRESHOLD, EDGE_THRESHOLD,
                     BORDER_REFLECT_101+BORDER_ISOLATED);
      // if (!Mask.empty())
      //   copyMakeBorder(mvMaskPyramid[level], masktemp, EDGE_THRESHOLD,
      //     EDGE_THRESHOLD, EDGE_THRESHOLD, EDGE_THRESHOLD,
      //     BORDER_CONSTANT+BORDER_ISOLATED);
    } else {
      copyMakeBorder(image, temp, EDGE_THRESHOLD, EDGE_THRESHOLD,
                     EDGE_THRESHOLD, EDGE_THRESHOLD, BORDER_REFLECT_101);
      // if(!Mask.empty())
      //   copyMakeBorder(Mask, masktemp, EDGE_THRESHOLD, EDGE_THRESHOLD,
      //     EDGE_THRESHOLD, EDGE_THRESHOLD, BORDER_CONSTANT+BORDER_ISOLATED);
    }
  }
}

// Util helper function to allow external access to sin_cos_0_360_deg_look_up
ORBextractor::SinCosAngleVal ORBextractor::get_sin_cos_0_360_deg_look_up(int i) {
  return sin_cos_0_360_deg_look_up[i];
}

float ORBextractor::travel_sin_cos_lookup_table(const std::vector<int>& angle) {
  float total = 0.f;
  for (int i = 0, size = angle.size(); i < size; ++i) {
    const SinCosAngleVal& tmp = sin_cos_0_360_deg_look_up[angle[i]];
    total += tmp.sin_val + tmp.cos_val;
  }
  return total;
}
